chenpangpang / transformers / Commits / dc17f2a1

Unverified commit dc17f2a1, authored Jan 16, 2020 by Thomas Wolf, committed by GitHub on Jan 16, 2020
Merge pull request #2538 from huggingface/py3_super

💄 super

Parents: 88085484, a98b2ca8

Changes: 75
Showing 20 changed files with 177 additions and 179 deletions (+177, -179)
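The change itself is mechanical: every Python 2-compatible call of the form super(ClassName, self).method(...) is replaced by the zero-argument Python 3 form super().method(...). A minimal sketch of the pattern, using a hypothetical module rather than one taken from this diff:

    import torch.nn as nn

    class MyLayer(nn.Module):  # hypothetical class, for illustration only
        def __init__(self, hidden_size):
            # Before (Python 2 style, still legal in Python 3 but redundant):
            #   super(MyLayer, self).__init__()
            super().__init__()  # After: zero-argument form introduced in Python 3
            self.proj = nn.Linear(hidden_size, hidden_size)

Both spellings dispatch to the same parent nn.Module.__init__; the zero-argument form simply stops repeating the class name, so renaming or copying a class cannot leave a stale reference behind.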
src/transformers/modeling_gpt2.py                      +7   -7
src/transformers/modeling_mmbt.py                      +3   -3
src/transformers/modeling_openai.py                    +6   -6
src/transformers/modeling_roberta.py                   +10  -10
src/transformers/modeling_t5.py                        +10  -10
src/transformers/modeling_tf_albert.py                 +13  -13
src/transformers/modeling_tf_bert.py                   +24  -24
src/transformers/modeling_tf_ctrl.py                   +7   -7
src/transformers/modeling_tf_distilbert.py             +14  -14
src/transformers/modeling_tf_gpt2.py                   +7   -7
src/transformers/modeling_tf_openai.py                 +7   -7
src/transformers/modeling_tf_roberta.py                +10  -12
src/transformers/modeling_tf_t5.py                     +11  -11
src/transformers/modeling_tf_transfo_xl.py             +11  -11
src/transformers/modeling_tf_transfo_xl_utilities.py   +2   -2
src/transformers/modeling_tf_utils.py                  +5   -5
src/transformers/modeling_tf_xlm.py                    +9   -9
src/transformers/modeling_tf_xlnet.py                  +13  -13
src/transformers/modeling_transfo_xl.py                +7   -7
src/transformers/modeling_transfo_xl_utilities.py      +1   -1
src/transformers/modeling_gpt2.py

@@ -101,7 +101,7 @@ def gelu(x):
 class Attention(nn.Module):
     def __init__(self, nx, n_ctx, config, scale=False):
-        super(Attention, self).__init__()
+        super().__init__()
         self.output_attentions = config.output_attentions
         n_state = nx  # in Attention: n_state=768 (nx=n_embd)

@@ -202,7 +202,7 @@ class Attention(nn.Module):
 class MLP(nn.Module):
     def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
-        super(MLP, self).__init__()
+        super().__init__()
         nx = config.n_embd
         self.c_fc = Conv1D(n_state, nx)
         self.c_proj = Conv1D(nx, n_state)

@@ -217,7 +217,7 @@ class MLP(nn.Module):
 class Block(nn.Module):
     def __init__(self, n_ctx, config, scale=False):
-        super(Block, self).__init__()
+        super().__init__()
         nx = config.n_embd
         self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
         self.attn = Attention(nx, n_ctx, config, scale)

@@ -249,7 +249,7 @@ class GPT2PreTrainedModel(PreTrainedModel):
     base_model_prefix = "transformer"

     def __init__(self, *inputs, **kwargs):
-        super(GPT2PreTrainedModel, self).__init__(*inputs, **kwargs)
+        super().__init__(*inputs, **kwargs)

     def _init_weights(self, module):
         """ Initialize the weights.

@@ -355,7 +355,7 @@ class GPT2Model(GPT2PreTrainedModel):
     """

     def __init__(self, config):
-        super(GPT2Model, self).__init__(config)
+        super().__init__(config)
         self.output_hidden_states = config.output_hidden_states
         self.output_attentions = config.output_attentions
         self.output_past = config.output_past

@@ -550,7 +550,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
     """

     def __init__(self, config):
-        super(GPT2LMHeadModel, self).__init__(config)
+        super().__init__(config)
         self.transformer = GPT2Model(config)
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

@@ -678,7 +678,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
     """

     def __init__(self, config):
-        super(GPT2DoubleHeadsModel, self).__init__(config)
+        super().__init__(config)
         config.num_labels = 1
         self.transformer = GPT2Model(config)
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

src/transformers/modeling_mmbt.py

@@ -33,7 +33,7 @@ class ModalEmbeddings(nn.Module):
     """

     def __init__(self, config, encoder, embeddings):
-        super(ModalEmbeddings, self).__init__()
+        super().__init__()
         self.config = config
         self.encoder = encoder
         self.proj_embeddings = nn.Linear(config.modal_hidden_size, config.hidden_size)

@@ -175,7 +175,7 @@ class MMBTModel(nn.Module):
     """

     def __init__(self, config, transformer, encoder):
-        super(MMBTModel, self).__init__()
+        super().__init__()
         self.config = config
         self.transformer = transformer
         self.modal_encoder = ModalEmbeddings(config, encoder, transformer.embeddings)

@@ -359,7 +359,7 @@ class MMBTForClassification(nn.Module):
     """

     def __init__(self, config, transformer, encoder):
-        super(MMBTForClassification, self).__init__()
+        super().__init__()
         self.num_labels = config.num_labels
         self.mmbt = MMBTModel(config, transformer, encoder)

src/transformers/modeling_openai.py

@@ -127,7 +127,7 @@ ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu}
 class Attention(nn.Module):
     def __init__(self, nx, n_ctx, config, scale=False):
-        super(Attention, self).__init__()
+        super().__init__()
         n_state = nx  # in Attention: n_state=768 (nx=n_embd)
         # [switch nx => n_state from Block to Attention to keep identical to TF implem]
         assert n_state % config.n_head == 0

@@ -221,7 +221,7 @@ class Attention(nn.Module):
 class MLP(nn.Module):
     def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
-        super(MLP, self).__init__()
+        super().__init__()
         nx = config.n_embd
         self.c_fc = Conv1D(n_state, nx)
         self.c_proj = Conv1D(nx, n_state)

@@ -236,7 +236,7 @@ class MLP(nn.Module):
 class Block(nn.Module):
     def __init__(self, n_ctx, config, scale=False):
-        super(Block, self).__init__()
+        super().__init__()
         nx = config.n_embd
         self.attn = Attention(nx, n_ctx, config, scale)
         self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)

@@ -359,7 +359,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
     """

     def __init__(self, config):
-        super(OpenAIGPTModel, self).__init__(config)
+        super().__init__(config)
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states

@@ -518,7 +518,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
     """

     def __init__(self, config):
-        super(OpenAIGPTLMHeadModel, self).__init__(config)
+        super().__init__(config)
         self.transformer = OpenAIGPTModel(config)
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

@@ -623,7 +623,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
     """

     def __init__(self, config):
-        super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
+        super().__init__(config)
         config.num_labels = 1
         self.transformer = OpenAIGPTModel(config)

src/transformers/modeling_roberta.py

@@ -45,7 +45,7 @@ class RobertaEmbeddings(BertEmbeddings):
     """

     def __init__(self, config):
-        super(RobertaEmbeddings, self).__init__(config)
+        super().__init__(config)
         self.padding_idx = 1
         self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
         self.position_embeddings = nn.Embedding(

@@ -60,7 +60,7 @@ class RobertaEmbeddings(BertEmbeddings):
         else:
             position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

-        return super(RobertaEmbeddings, self).forward(
+        return super().forward(
             input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds
         )

@@ -204,7 +204,7 @@ class RobertaModel(BertModel):
     base_model_prefix = "roberta"

     def __init__(self, config):
-        super(RobertaModel, self).__init__(config)
+        super().__init__(config)

         self.embeddings = RobertaEmbeddings(config)
         self.init_weights()

@@ -254,7 +254,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
     base_model_prefix = "roberta"

     def __init__(self, config):
-        super(RobertaForMaskedLM, self).__init__(config)
+        super().__init__(config)

         self.roberta = RobertaModel(config)
         self.lm_head = RobertaLMHead(config)

@@ -299,7 +299,7 @@ class RobertaLMHead(nn.Module):
     """Roberta Head for masked language modeling."""

     def __init__(self, config):
-        super(RobertaLMHead, self).__init__()
+        super().__init__()
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
         self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)

@@ -362,7 +362,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
     base_model_prefix = "roberta"

     def __init__(self, config):
-        super(RobertaForSequenceClassification, self).__init__(config)
+        super().__init__(config)
         self.num_labels = config.num_labels

         self.roberta = RobertaModel(config)

@@ -484,7 +484,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
     base_model_prefix = "roberta"

     def __init__(self, config):
-        super(RobertaForMultipleChoice, self).__init__(config)
+        super().__init__(config)

         self.roberta = RobertaModel(config)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)

@@ -571,7 +571,7 @@ class RobertaForTokenClassification(BertPreTrainedModel):
     base_model_prefix = "roberta"

     def __init__(self, config):
-        super(RobertaForTokenClassification, self).__init__(config)
+        super().__init__(config)
         self.num_labels = config.num_labels

         self.roberta = RobertaModel(config)

@@ -625,7 +625,7 @@ class RobertaClassificationHead(nn.Module):
     """Head for sentence-level classification tasks."""

     def __init__(self, config):
-        super(RobertaClassificationHead, self).__init__()
+        super().__init__()
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

@@ -684,7 +684,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
     base_model_prefix = "roberta"

     def __init__(self, config):
-        super(RobertaForQuestionAnswering, self).__init__(config)
+        super().__init__(config)
         self.num_labels = config.num_labels

         self.roberta = RobertaModel(config)

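One RoBERTa hunk above also rewrites a non-constructor call, super(RobertaEmbeddings, self).forward(...) becoming super().forward(...). The zero-argument form is not limited to __init__: inside any method defined in a class body, Python 3 supplies the class and instance automatically through the implicit __class__ cell, so the rewrite preserves behavior. A small standalone sketch with hypothetical class names:

    class BaseEmbeddings:
        def forward(self, ids):
            return [i + 1 for i in ids]

    class ScaledEmbeddings(BaseEmbeddings):
        def forward(self, ids):
            # Delegate to the parent implementation, then post-process;
            # equivalent to super(ScaledEmbeddings, self).forward(ids).
            return [2 * x for x in super().forward(ids)]

    print(ScaledEmbeddings().forward([1, 2, 3]))  # -> [4, 6, 8]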
src/transformers/modeling_t5.py

@@ -142,7 +142,7 @@ class T5LayerNorm(nn.Module):
         """ Construct a layernorm module in the T5 style
            No bias and no substraction of mean.
         """
-        super(T5LayerNorm, self).__init__()
+        super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
         self.variance_epsilon = eps

@@ -154,7 +154,7 @@ class T5LayerNorm(nn.Module):
 class T5DenseReluDense(nn.Module):
     def __init__(self, config):
-        super(T5DenseReluDense, self).__init__()
+        super().__init__()
         self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
         self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
         self.dropout = nn.Dropout(config.dropout_rate)

@@ -169,7 +169,7 @@ class T5DenseReluDense(nn.Module):
 class T5LayerFF(nn.Module):
     def __init__(self, config):
-        super(T5LayerFF, self).__init__()
+        super().__init__()
         self.DenseReluDense = T5DenseReluDense(config)
         self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
         self.dropout = nn.Dropout(config.dropout_rate)

@@ -185,7 +185,7 @@ class T5Attention(nn.Module):
     NEW_ID = itertools.count()

     def __init__(self, config, has_relative_attention_bias=False):
-        super(T5Attention, self).__init__()
+        super().__init__()
         self.layer_id = next(T5Attention.NEW_ID)
         self.is_decoder = config.is_decoder
         self.has_relative_attention_bias = has_relative_attention_bias

@@ -363,7 +363,7 @@ class T5Attention(nn.Module):
 class T5LayerSelfAttention(nn.Module):
     def __init__(self, config, has_relative_attention_bias=False):
-        super(T5LayerSelfAttention, self).__init__()
+        super().__init__()
         self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
         self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
         self.dropout = nn.Dropout(config.dropout_rate)

@@ -381,7 +381,7 @@ class T5LayerSelfAttention(nn.Module):
 class T5LayerCrossAttention(nn.Module):
     def __init__(self, config, has_relative_attention_bias=False):
-        super(T5LayerCrossAttention, self).__init__()
+        super().__init__()
         self.EncDecAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
         self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
         self.dropout = nn.Dropout(config.dropout_rate)

@@ -399,7 +399,7 @@ class T5LayerCrossAttention(nn.Module):
 class T5Block(nn.Module):
     def __init__(self, config, has_relative_attention_bias=False):
-        super(T5Block, self).__init__()
+        super().__init__()
         self.is_decoder = config.is_decoder
         self.layer = nn.ModuleList()
         self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias))

@@ -501,7 +501,7 @@ class T5PreTrainedModel(PreTrainedModel):
 class T5Stack(T5PreTrainedModel):
     def __init__(self, config):
-        super(T5Stack, self).__init__(config)
+        super().__init__(config)
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
         self.is_decoder = config.is_decoder

@@ -724,7 +724,7 @@ class T5Model(T5PreTrainedModel):
     """

     def __init__(self, config):
-        super(T5Model, self).__init__(config)
+        super().__init__(config)
         self.shared = nn.Embedding(config.vocab_size, config.d_model)

         encoder_config = copy.deepcopy(config)

@@ -830,7 +830,7 @@ class T5WithLMHeadModel(T5PreTrainedModel):
     """

     def __init__(self, config):
-        super(T5WithLMHeadModel, self).__init__(config)
+        super().__init__(config)
         self.model_dim = config.d_model

         self.shared = nn.Embedding(config.vocab_size, config.d_model)

src/transformers/modeling_tf_albert.py

@@ -45,7 +45,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
     """

     def __init__(self, config, **kwargs):
-        super(TFAlbertEmbeddings, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.config = config
         self.position_embeddings = tf.keras.layers.Embedding(

@@ -76,7 +76,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
             shape=[self.config.vocab_size, self.config.embedding_size],
             initializer=get_initializer(self.config.initializer_range),
         )
-        super(TFAlbertEmbeddings, self).build(input_shape)
+        super().build(input_shape)

     def call(self, inputs, mode="embedding", training=False):
         """Get token embeddings of inputs.

@@ -141,7 +141,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
 class TFAlbertSelfAttention(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFAlbertSelfAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         if config.hidden_size % config.num_attention_heads != 0:
             raise ValueError(
                 "The hidden size (%d) is not a multiple of the number of attention "

@@ -217,7 +217,7 @@ class TFAlbertSelfAttention(tf.keras.layers.Layer):
 class TFAlbertSelfOutput(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFAlbertSelfOutput, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.dense = tf.keras.layers.Dense(
             config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )

@@ -235,7 +235,7 @@ class TFAlbertSelfOutput(tf.keras.layers.Layer):
 class TFAlbertAttention(TFBertSelfAttention):
     def __init__(self, config, **kwargs):
-        super(TFAlbertAttention, self).__init__(config, **kwargs)
+        super().__init__(config, **kwargs)
         self.hidden_size = config.hidden_size
         self.dense = tf.keras.layers.Dense(

@@ -303,7 +303,7 @@ class TFAlbertAttention(TFBertSelfAttention):
 class TFAlbertLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFAlbertLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.attention = TFAlbertAttention(config, name="attention")
         self.ffn = tf.keras.layers.Dense(

@@ -341,7 +341,7 @@ class TFAlbertLayer(tf.keras.layers.Layer):
 class TFAlbertLayerGroup(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFAlbertLayerGroup, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states

@@ -376,7 +376,7 @@ class TFAlbertLayerGroup(tf.keras.layers.Layer):
 class TFAlbertTransformer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFAlbertTransformer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.config = config
         self.output_attentions = config.output_attentions

@@ -445,7 +445,7 @@ class TFAlbertPreTrainedModel(TFPreTrainedModel):
 class TFAlbertMLMHead(tf.keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
-        super(TFAlbertMLMHead, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = config.vocab_size
         self.dense = tf.keras.layers.Dense(

@@ -467,7 +467,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
         self.decoder_bias = self.add_weight(
             shape=(self.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias"
         )
-        super(TFAlbertMLMHead, self).build(input_shape)
+        super().build(input_shape)

     def call(self, hidden_states):
         hidden_states = self.dense(hidden_states)

@@ -596,7 +596,7 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
     """

     def __init__(self, config, **kwargs):
-        super(TFAlbertModel, self).__init__(config, **kwargs)
+        super().__init__(config, **kwargs)
         self.num_hidden_layers = config.num_hidden_layers
         self.embeddings = TFAlbertEmbeddings(config, name="embeddings")

@@ -733,7 +733,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFAlbertForMaskedLM, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)

         self.albert = TFAlbertModel(config, name="albert")
         self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions")

@@ -786,7 +786,7 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFAlbertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels

         self.albert = TFAlbertModel(config, name="albert")

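Several of the TensorFlow hunks above touch build() rather than __init__ (for example TFAlbertEmbeddings.build and TFAlbertMLMHead.build): after creating its weights, a custom Keras layer hands control back to the base class with super().build(input_shape), and the same zero-argument rewrite applies there. A minimal standalone sketch of that pattern, with a hypothetical layer not taken from this diff:

    import tensorflow as tf

    class BiasOnly(tf.keras.layers.Layer):  # hypothetical example
        def build(self, input_shape):
            # Create this layer's weights, then let Keras mark the layer as built;
            # equivalent to super(BiasOnly, self).build(input_shape).
            self.bias = self.add_weight(shape=(input_shape[-1],), initializer="zeros", name="bias")
            super().build(input_shape)

        def call(self, inputs):
            return inputs + self.bias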
src/transformers/modeling_tf_bert.py

@@ -93,7 +93,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
     """

     def __init__(self, config, **kwargs):
-        super(TFBertEmbeddings, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = config.vocab_size
         self.hidden_size = config.hidden_size
         self.initializer_range = config.initializer_range

@@ -126,7 +126,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
             shape=[self.vocab_size, self.hidden_size],
             initializer=get_initializer(self.initializer_range),
         )
-        super(TFBertEmbeddings, self).build(input_shape)
+        super().build(input_shape)

     def call(self, inputs, mode="embedding", training=False):
         """Get token embeddings of inputs.

@@ -193,7 +193,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
 class TFBertSelfAttention(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertSelfAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         if config.hidden_size % config.num_attention_heads != 0:
             raise ValueError(
                 "The hidden size (%d) is not a multiple of the number of attention "

@@ -269,7 +269,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
 class TFBertSelfOutput(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertSelfOutput, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.dense = tf.keras.layers.Dense(
             config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )

@@ -287,7 +287,7 @@ class TFBertSelfOutput(tf.keras.layers.Layer):
 class TFBertAttention(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.self_attention = TFBertSelfAttention(config, name="self")
         self.dense_output = TFBertSelfOutput(config, name="output")

@@ -305,7 +305,7 @@ class TFBertAttention(tf.keras.layers.Layer):
 class TFBertIntermediate(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertIntermediate, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.dense = tf.keras.layers.Dense(
             config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )

@@ -322,7 +322,7 @@ class TFBertIntermediate(tf.keras.layers.Layer):
 class TFBertOutput(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertOutput, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.dense = tf.keras.layers.Dense(
             config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )

@@ -340,7 +340,7 @@ class TFBertOutput(tf.keras.layers.Layer):
 class TFBertLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.attention = TFBertAttention(config, name="attention")
         self.intermediate = TFBertIntermediate(config, name="intermediate")
         self.bert_output = TFBertOutput(config, name="output")

@@ -358,7 +358,7 @@ class TFBertLayer(tf.keras.layers.Layer):
 class TFBertEncoder(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertEncoder, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
         self.layer = [TFBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)]

@@ -392,7 +392,7 @@ class TFBertEncoder(tf.keras.layers.Layer):
 class TFBertPooler(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertPooler, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.dense = tf.keras.layers.Dense(
             config.hidden_size,
             kernel_initializer=get_initializer(config.initializer_range),

@@ -410,7 +410,7 @@ class TFBertPooler(tf.keras.layers.Layer):
 class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertPredictionHeadTransform, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.dense = tf.keras.layers.Dense(
             config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )

@@ -429,7 +429,7 @@ class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
 class TFBertLMPredictionHead(tf.keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
-        super(TFBertLMPredictionHead, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = config.vocab_size
         self.transform = TFBertPredictionHeadTransform(config, name="transform")

@@ -439,7 +439,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
     def build(self, input_shape):
         self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
-        super(TFBertLMPredictionHead, self).build(input_shape)
+        super().build(input_shape)

     def call(self, hidden_states):
         hidden_states = self.transform(hidden_states)

@@ -450,7 +450,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
 class TFBertMLMHead(tf.keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
-        super(TFBertMLMHead, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.predictions = TFBertLMPredictionHead(config, input_embeddings, name="predictions")

     def call(self, sequence_output):

@@ -460,7 +460,7 @@ class TFBertMLMHead(tf.keras.layers.Layer):
 class TFBertNSPHead(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertNSPHead, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.seq_relationship = tf.keras.layers.Dense(
             2, kernel_initializer=get_initializer(config.initializer_range), name="seq_relationship"
         )

@@ -472,7 +472,7 @@ class TFBertNSPHead(tf.keras.layers.Layer):
 class TFBertMainLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertMainLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.num_hidden_layers = config.num_hidden_layers
         self.embeddings = TFBertEmbeddings(config, name="embeddings")

@@ -707,7 +707,7 @@ class TFBertModel(TFBertPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFBertModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.bert = TFBertMainLayer(config, name="bert")

     def call(self, inputs, **kwargs):

@@ -750,7 +750,7 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFBertForPreTraining, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)

         self.bert = TFBertMainLayer(config, name="bert")
         self.nsp = TFBertNSPHead(config, name="nsp___cls")

@@ -803,7 +803,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFBertForMaskedLM, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)

         self.bert = TFBertMainLayer(config, name="bert")
         self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")

@@ -854,7 +854,7 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFBertForNextSentencePrediction, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)

         self.bert = TFBertMainLayer(config, name="bert")
         self.nsp = TFBertNSPHead(config, name="nsp___cls")

@@ -903,7 +903,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFBertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels

         self.bert = TFBertMainLayer(config, name="bert")

@@ -960,7 +960,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFBertForMultipleChoice, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)

         self.bert = TFBertMainLayer(config, name="bert")
         self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)

@@ -1064,7 +1064,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFBertForTokenClassification, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels

         self.bert = TFBertMainLayer(config, name="bert")

@@ -1121,7 +1121,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels

         self.bert = TFBertMainLayer(config, name="bert")

src/transformers/modeling_tf_ctrl.py

@@ -75,7 +75,7 @@ def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=N
 class TFMultiHeadAttention(tf.keras.layers.Layer):
     def __init__(self, d_model_size, num_heads, output_attentions=False, **kwargs):
-        super(TFMultiHeadAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.output_attentions = output_attentions
         self.num_heads = num_heads
         self.d_model_size = d_model_size

@@ -132,7 +132,7 @@ class TFEncoderLayer(tf.keras.layers.Layer):
     def __init__(
         self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs
     ):
-        super(TFEncoderLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.multi_head_attention = TFMultiHeadAttention(
             d_model_size, num_heads, output_attentions, name="multi_head_attention"

@@ -166,7 +166,7 @@ class TFEncoderLayer(tf.keras.layers.Layer):
 class TFCTRLMainLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFCTRLMainLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.output_hidden_states = config.output_hidden_states
         self.output_attentions = config.output_attentions
         self.output_past = config.output_past

@@ -443,7 +443,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFCTRLModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.transformer = TFCTRLMainLayer(config, name="transformer")

     def call(self, inputs, **kwargs):

@@ -453,7 +453,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
 class TFCTRLLMHead(tf.keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
-        super(TFCTRLLMHead, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = config.vocab_size
         # The output weights are the same as the input embeddings, but there is

@@ -462,7 +462,7 @@ class TFCTRLLMHead(tf.keras.layers.Layer):
     def build(self, input_shape):
         self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
-        super(TFCTRLLMHead, self).build(input_shape)
+        super().build(input_shape)

     def call(self, hidden_states):
         hidden_states = self.input_embeddings(hidden_states, mode="linear")

@@ -508,7 +508,7 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFCTRLLMHeadModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.transformer = TFCTRLMainLayer(config, name="transformer")

         self.lm_head = TFCTRLLMHead(config, self.transformer.w, name="lm_head")

src/transformers/modeling_tf_distilbert.py
View file @ dc17f2a1
...
@@ -65,7 +65,7 @@ def gelu_new(x):
class TFEmbeddings(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
-        super(TFEmbeddings, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.vocab_size = config.vocab_size
        self.dim = config.dim
        self.initializer_range = config.initializer_range
...
@@ -92,7 +92,7 @@ class TFEmbeddings(tf.keras.layers.Layer):
        self.word_embeddings = self.add_weight(
            "weight", shape=[self.vocab_size, self.dim], initializer=get_initializer(self.initializer_range)
        )
-        super(TFEmbeddings, self).build(input_shape)
+        super().build(input_shape)

    def call(self, inputs, inputs_embeds=None, mode="embedding", training=False):
        """Get token embeddings of inputs.
...
@@ -169,7 +169,7 @@ class TFEmbeddings(tf.keras.layers.Layer):
class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
-        super(TFMultiHeadSelfAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.n_heads = config.n_heads
        self.dim = config.dim
...
@@ -259,7 +259,7 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
class TFFFN(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
-        super(TFFFN, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.dropout = tf.keras.layers.Dropout(config.dropout)
        self.lin1 = tf.keras.layers.Dense(
            config.hidden_dim, kernel_initializer=get_initializer(config.initializer_range), name="lin1"
...
@@ -284,7 +284,7 @@ class TFFFN(tf.keras.layers.Layer):
class TFTransformerBlock(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
-        super(TFTransformerBlock, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.n_heads = config.n_heads
        self.dim = config.dim
...
@@ -338,7 +338,7 @@ class TFTransformerBlock(tf.keras.layers.Layer):
class TFTransformer(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
-        super(TFTransformer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.n_layers = config.n_layers
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
...
@@ -399,7 +399,7 @@ class TFTransformer(tf.keras.layers.Layer):
class TFDistilBertMainLayer(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
-        super(TFDistilBertMainLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.num_hidden_layers = config.num_hidden_layers

        self.embeddings = TFEmbeddings(config, name="embeddings")  # Embeddings
...
@@ -569,7 +569,7 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
    """

    def __init__(self, config, *inputs, **kwargs):
-        super(TFDistilBertModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")  # Embeddings

    def call(self, inputs, **kwargs):
...
@@ -579,7 +579,7 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
class TFDistilBertLMHead(tf.keras.layers.Layer):
    def __init__(self, config, input_embeddings, **kwargs):
-        super(TFDistilBertLMHead, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.vocab_size = config.vocab_size
        # The output weights are the same as the input embeddings, but there is
...
@@ -588,7 +588,7 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
    def build(self, input_shape):
        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
-        super(TFDistilBertLMHead, self).build(input_shape)
+        super().build(input_shape)

    def call(self, hidden_states):
        hidden_states = self.input_embeddings(hidden_states, mode="linear")
...
@@ -628,7 +628,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
    """

    def __init__(self, config, *inputs, **kwargs):
-        super(TFDistilBertForMaskedLM, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.vocab_size = config.vocab_size
...
@@ -690,7 +690,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
    """

    def __init__(self, config, *inputs, **kwargs):
-        super(TFDistilBertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
...
@@ -747,7 +747,7 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
    """

    def __init__(self, config, *inputs, **kwargs):
-        super(TFDistilBertForTokenClassification, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
...
@@ -804,7 +804,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
    """

    def __init__(self, config, *inputs, **kwargs):
-        super(TFDistilBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
        self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
        self.qa_outputs = tf.keras.layers.Dense(
...
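Several of the DistilBERT hunks above touch the feed-forward block (`TFFFN`: `lin1` -> activation -> `lin2` with dropout). The sketch below shows that block shape in isolation; the sizes and the plain ReLU activation are simplifications chosen here (the real layer uses a GELU variant and the config's initializer), so treat it as an illustration rather than the library code.

import tensorflow as tf


class SimpleFFN(tf.keras.layers.Layer):
    """Sketch of a transformer feed-forward block: Dense -> activation -> Dense -> dropout."""

    def __init__(self, hidden_dim, out_dim, dropout=0.1, **kwargs):
        super().__init__(**kwargs)
        self.dropout = tf.keras.layers.Dropout(dropout)
        self.lin1 = tf.keras.layers.Dense(hidden_dim, activation="relu", name="lin1")
        self.lin2 = tf.keras.layers.Dense(out_dim, name="lin2")

    def call(self, x, training=False):
        x = self.lin1(x)   # expand to the inner dimension
        x = self.lin2(x)   # project back to the model dimension
        return self.dropout(x, training=training)


ffn = SimpleFFN(hidden_dim=3072, out_dim=768)
y = ffn(tf.zeros((2, 16, 768)), training=True)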
src/transformers/modeling_tf_gpt2.py
View file @ dc17f2a1
...
@@ -58,7 +58,7 @@ def gelu(x):
class TFAttention(tf.keras.layers.Layer):
    def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
-        super(TFAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.output_attentions = config.output_attentions

        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
...
@@ -157,7 +157,7 @@ class TFAttention(tf.keras.layers.Layer):
class TFMLP(tf.keras.layers.Layer):
    def __init__(self, n_state, config, **kwargs):
-        super(TFMLP, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        nx = config.n_embd
        self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc")
        self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")
...
@@ -173,7 +173,7 @@ class TFMLP(tf.keras.layers.Layer):
class TFBlock(tf.keras.layers.Layer):
    def __init__(self, n_ctx, config, scale=False, **kwargs):
-        super(TFBlock, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        nx = config.n_embd
        self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
        self.attn = TFAttention(nx, n_ctx, config, scale, name="attn")
...
@@ -198,7 +198,7 @@ class TFBlock(tf.keras.layers.Layer):
class TFGPT2MainLayer(tf.keras.layers.Layer):
    def __init__(self, config, *inputs, **kwargs):
-        super(TFGPT2MainLayer, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
        self.output_hidden_states = config.output_hidden_states
        self.output_attentions = config.output_attentions
        self.num_hidden_layers = config.n_layer
...
@@ -475,7 +475,7 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
    """

    def __init__(self, config, *inputs, **kwargs):
-        super(TFGPT2Model, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFGPT2MainLayer(config, name="transformer")

    def call(self, inputs, **kwargs):
...
@@ -521,7 +521,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
    """

    def __init__(self, config, *inputs, **kwargs):
-        super(TFGPT2LMHeadModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFGPT2MainLayer(config, name="transformer")

    def get_output_embeddings(self):
...
@@ -598,7 +598,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
    """

    def __init__(self, config, *inputs, **kwargs):
-        super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
        config.num_labels = 1
        self.transformer = TFGPT2MainLayer(config, name="transformer")
        self.multiple_choice_head = TFSequenceSummary(
...
src/transformers/modeling_tf_openai.py
View file @ dc17f2a1
...
@@ -66,7 +66,7 @@ ACT_FNS = {
class TFAttention(tf.keras.layers.Layer):
    def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
-        super(TFAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.output_attentions = config.output_attentions

        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
...
@@ -160,7 +160,7 @@ class TFAttention(tf.keras.layers.Layer):
class TFMLP(tf.keras.layers.Layer):
    def __init__(self, n_state, config, **kwargs):
-        super(TFMLP, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        nx = config.n_embd
        self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc")
        self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")
...
@@ -176,7 +176,7 @@ class TFMLP(tf.keras.layers.Layer):
class TFBlock(tf.keras.layers.Layer):
    def __init__(self, n_ctx, config, scale=False, **kwargs):
-        super(TFBlock, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        nx = config.n_embd
        self.attn = TFAttention(nx, n_ctx, config, scale, name="attn")
        self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
...
@@ -199,7 +199,7 @@ class TFBlock(tf.keras.layers.Layer):
class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
    def __init__(self, config, *inputs, **kwargs):
-        super(TFOpenAIGPTMainLayer, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
        self.output_hidden_states = config.output_hidden_states
        self.output_attentions = config.output_attentions
        self.num_hidden_layers = config.n_layer
...
@@ -453,7 +453,7 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
    """

    def __init__(self, config, *inputs, **kwargs):
-        super(TFOpenAIGPTModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")

    def call(self, inputs, **kwargs):
...
@@ -494,7 +494,7 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
    """

    def __init__(self, config, *inputs, **kwargs):
-        super(TFOpenAIGPTLMHeadModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")

    def get_output_embeddings(self):
...
@@ -563,7 +563,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
    """

    def __init__(self, config, *inputs, **kwargs):
-        super(TFOpenAIGPTDoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
        config.num_labels = 1
        self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
        self.multiple_choice_head = TFSequenceSummary(
...
src/transformers/modeling_tf_roberta.py
View file @ dc17f2a1
...
@@ -42,7 +42,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings):
    """

    def __init__(self, config, **kwargs):
-        super(TFRobertaEmbeddings, self).__init__(config, **kwargs)
+        super().__init__(config, **kwargs)
        self.padding_idx = 1

    def create_position_ids_from_input_ids(self, x):
...
@@ -78,9 +78,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings):
        else:
            position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

-        return super(TFRobertaEmbeddings, self)._embedding(
-            [input_ids, position_ids, token_type_ids, inputs_embeds], training=training
-        )
+        return super()._embedding([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)


class TFRobertaMainLayer(TFBertMainLayer):
...
@@ -89,7 +87,7 @@ class TFRobertaMainLayer(TFBertMainLayer):
    """

    def __init__(self, config, **kwargs):
-        super(TFRobertaMainLayer, self).__init__(config, **kwargs)
+        super().__init__(config, **kwargs)
        self.embeddings = TFRobertaEmbeddings(config, name="embeddings")

    def get_input_embeddings(self):
...
@@ -234,7 +232,7 @@ class TFRobertaModel(TFRobertaPreTrainedModel):
    """

    def __init__(self, config, *inputs, **kwargs):
-        super(TFRobertaModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
        self.roberta = TFRobertaMainLayer(config, name="roberta")

    def call(self, inputs, **kwargs):
...
@@ -246,7 +244,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
    """Roberta Head for masked language modeling."""

    def __init__(self, config, input_embeddings, **kwargs):
-        super(TFRobertaLMHead, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.vocab_size = config.vocab_size
        self.dense = tf.keras.layers.Dense(
            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
...
@@ -260,7 +258,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
    def build(self, input_shape):
        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
-        super(TFRobertaLMHead, self).build(input_shape)
+        super().build(input_shape)

    def call(self, features):
        x = self.dense(features)
...
@@ -305,7 +303,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
    """

    def __init__(self, config, *inputs, **kwargs):
-        super(TFRobertaForMaskedLM, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
        self.roberta = TFRobertaMainLayer(config, name="roberta")
        self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings, name="lm_head")
...
@@ -328,7 +326,7 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer):
    """Head for sentence-level classification tasks."""

    def __init__(self, config, **kwargs):
-        super(TFRobertaClassificationHead, self).__init__(config, **kwargs)
+        super().__init__(config, **kwargs)
        self.dense = tf.keras.layers.Dense(
            config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
...
@@ -383,7 +381,7 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel):
    """

    def __init__(self, config, *inputs, **kwargs):
-        super(TFRobertaForSequenceClassification, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.roberta = TFRobertaMainLayer(config, name="roberta")
...
@@ -433,7 +431,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel):
    """

    def __init__(self, config, *inputs, **kwargs):
-        super(TFRobertaForTokenClassification, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.roberta = TFRobertaMainLayer(config, name="roberta")
...
src/transformers/modeling_tf_t5.py
View file @ dc17f2a1
...
@@ -50,13 +50,13 @@ class TFT5LayerNorm(tf.keras.layers.Layer):
        """ Construct a layernorm module in the T5 style
            No bias and no substraction of mean.
        """
-        super(TFT5LayerNorm, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.variance_epsilon = epsilon

    def build(self, input_shape):
        """Build shared word embedding layer """
        self.weight = self.add_weight("weight", shape=(input_shape[-1],), initializer="ones")
-        super(TFT5LayerNorm, self).build(input_shape)
+        super().build(input_shape)

    def call(self, x):
        variance = tf.math.reduce_mean(tf.math.square(x), axis=-1, keepdims=True)
...
@@ -66,7 +66,7 @@ class TFT5LayerNorm(tf.keras.layers.Layer):
class TFT5DenseReluDense(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
-        super(TFT5DenseReluDense, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.wi = tf.keras.layers.Dense(config.d_ff, use_bias=False, name="wi")
        self.wo = tf.keras.layers.Dense(config.d_model, use_bias=False, name="wo")
        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
...
@@ -82,7 +82,7 @@ class TFT5DenseReluDense(tf.keras.layers.Layer):
class TFT5LayerFF(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
-        super(TFT5LayerFF, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.DenseReluDense = TFT5DenseReluDense(config, name="DenseReluDense")
        self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm")
        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
...
@@ -98,7 +98,7 @@ class TFT5Attention(tf.keras.layers.Layer):
    NEW_ID = itertools.count()

    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
-        super(TFT5Attention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.layer_id = next(TFT5Attention.NEW_ID)
        self.is_decoder = config.is_decoder
        self.has_relative_attention_bias = has_relative_attention_bias
...
@@ -259,7 +259,7 @@ class TFT5Attention(tf.keras.layers.Layer):
class TFT5LayerSelfAttention(tf.keras.layers.Layer):
    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
-        super(TFT5LayerSelfAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.SelfAttention = TFT5Attention(
            config, has_relative_attention_bias=has_relative_attention_bias, name="SelfAttention"
        )
...
@@ -279,7 +279,7 @@ class TFT5LayerSelfAttention(tf.keras.layers.Layer):
class TFT5LayerCrossAttention(tf.keras.layers.Layer):
    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
-        super(TFT5LayerCrossAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.EncDecAttention = TFT5Attention(
            config, has_relative_attention_bias=has_relative_attention_bias, name="EncDecAttention"
        )
...
@@ -299,7 +299,7 @@ class TFT5LayerCrossAttention(tf.keras.layers.Layer):
class TFT5Block(tf.keras.layers.Layer):
    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
-        super(TFT5Block, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.is_decoder = config.is_decoder
        self.layer = []
        self.layer.append(
...
@@ -361,7 +361,7 @@ class TFT5Block(tf.keras.layers.Layer):
####################################################
class TFT5MainLayer(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
-        super(TFT5MainLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.is_decoder = config.is_decoder
...
@@ -633,7 +633,7 @@ class TFT5Model(TFT5PreTrainedModel):
    """

    def __init__(self, config, *inputs, **kwargs):
-        super(TFT5Model, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
        self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared")

        encoder_config = copy.deepcopy(config)
...
@@ -724,7 +724,7 @@ class TFT5WithLMHeadModel(TFT5PreTrainedModel):
    """

    def __init__(self, config, *inputs, **kwargs):
-        super(TFT5WithLMHeadModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
        self.model_dim = config.d_model

        self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared")
...
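The first T5 hunk shows `TFT5LayerNorm`, a layer norm with no bias and no mean subtraction; only the variance line is visible in the diff. Below is a sketch of the full computation under the usual RMS-norm formulation; the rescaling step after the variance line is an assumption filled in from that formulation, not shown in the hunk.

import tensorflow as tf


class RMSLayerNorm(tf.keras.layers.Layer):
    """T5-style layer norm sketch: scale only, no bias, no mean subtraction."""

    def __init__(self, epsilon=1e-6, **kwargs):
        super().__init__(**kwargs)
        self.variance_epsilon = epsilon

    def build(self, input_shape):
        # one learnable gain per feature dimension
        self.weight = self.add_weight("weight", shape=(input_shape[-1],), initializer="ones")
        super().build(input_shape)

    def call(self, x):
        variance = tf.math.reduce_mean(tf.math.square(x), axis=-1, keepdims=True)
        x = x * tf.math.rsqrt(variance + self.variance_epsilon)  # assumed rescaling step
        return self.weight * x


norm = RMSLayerNorm()
out = norm(tf.random.normal((2, 8, 512)))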
src/transformers/modeling_tf_transfo_xl.py
View file @ dc17f2a1
...
@@ -36,7 +36,7 @@ TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = {
class TFPositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, demb, **kwargs):
-        super(TFPositionalEmbedding, self).__init__(**kwargs)
+        super().__init__(**kwargs)

        self.inv_freq = 1 / (10000 ** (tf.range(0, demb, 2.0) / demb))
...
@@ -52,7 +52,7 @@ class TFPositionalEmbedding(tf.keras.layers.Layer):
class TFPositionwiseFF(tf.keras.layers.Layer):
    def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs):
-        super(TFPositionwiseFF, self).__init__(**kwargs)
+        super().__init__(**kwargs)

        self.d_model = d_model
        self.d_inner = d_inner
...
@@ -112,7 +112,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
        init_std=0.02,
        **kwargs
    ):
-        super(TFRelPartialLearnableMultiHeadAttn, self).__init__(**kwargs)
+        super().__init__(**kwargs)

        self.output_attentions = output_attentions
        self.n_head = n_head
...
@@ -155,7 +155,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
            self.r_w_bias = self.add_weight(
                shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias"
            )
-        super(TFRelPartialLearnableMultiHeadAttn, self).build(input_shape)
+        super().build(input_shape)

    def _rel_shift(self, x):
        x_size = shape_list(x)
...
@@ -267,7 +267,7 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer):
        init_std=0.02,
        **kwargs
    ):
-        super(TFRelPartialLearnableDecoderLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.dec_attn = TFRelPartialLearnableMultiHeadAttn(
            n_head,
...
@@ -308,7 +308,7 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer):
class TFAdaptiveEmbedding(tf.keras.layers.Layer):
    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02, sample_softmax=False, **kwargs):
-        super(TFAdaptiveEmbedding, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.n_token = n_token
        self.d_embed = d_embed
...
@@ -350,7 +350,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer):
                    name="emb_projs_._{}".format(i),
                )
            )
-        super(TFAdaptiveEmbedding, self).build(input_shape)
+        super().build(input_shape)

    def call(self, inp):
        if self.div_val == 1:
...
@@ -380,7 +380,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer):
class TFTransfoXLMainLayer(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
-        super(TFTransfoXLMainLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
...
@@ -455,7 +455,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
            self.r_r_bias = self.add_weight(
                shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias"
            )
-        super(TFTransfoXLMainLayer, self).build(input_shape)
+        super().build(input_shape)

    def get_input_embeddings(self):
        return self.word_emb
...
@@ -728,7 +728,7 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
    """

    def __init__(self, config, *inputs, **kwargs):
-        super(TFTransfoXLModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFTransfoXLMainLayer(config, name="transformer")

    def call(self, inputs, **kwargs):
...
@@ -774,7 +774,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
    """

    def __init__(self, config):
-        super(TFTransfoXLLMHeadModel, self).__init__(config)
+        super().__init__(config)
        self.transformer = TFTransfoXLMainLayer(config, name="transformer")
        self.sample_softmax = config.sample_softmax
        # use sampled softmax
...
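The Transformer-XL hunk above only shows the inverse-frequency buffer (`inv_freq`). For context, here is a sketch of how such a buffer is typically expanded into sinusoidal positional embeddings; the outer product and the sin/cos concatenation follow the standard construction and are an assumption, not taken from the diff.

import tensorflow as tf

demb = 8                                   # embedding size (illustrative)
inv_freq = 1 / (10000 ** (tf.range(0, demb, 2.0) / demb))
pos_seq = tf.range(5.0)                    # positions 0..4
sinusoid = tf.einsum("i,j->ij", pos_seq, inv_freq)             # (positions, demb / 2)
pos_emb = tf.concat([tf.sin(sinusoid), tf.cos(sinusoid)], -1)  # (positions, demb)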
src/transformers/modeling_tf_transfo_xl_utilities.py
View file @ dc17f2a1
...
@@ -24,7 +24,7 @@ from .modeling_tf_utils import shape_list
class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
    def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1, keep_order=False, **kwargs):
-        super(TFAdaptiveSoftmaxMask, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.d_embed = d_embed
...
@@ -98,7 +98,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
                    name="out_layers_._{}_._bias".format(i),
                )
                self.out_layers.append((weight, bias))
-        super(TFAdaptiveSoftmaxMask, self).build(input_shape)
+        super().build(input_shape)

    @staticmethod
    def _logit(x, W, b, proj=None):
...
src/transformers/modeling_tf_utils.py
View file @ dc17f2a1
...
@@ -78,7 +78,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin):
        return {"input_ids": tf.constant(DUMMY_INPUTS)}

    def __init__(self, config, *inputs, **kwargs):
-        super(TFPreTrainedModel, self).__init__(*inputs, **kwargs)
+        super().__init__(*inputs, **kwargs)
        if not isinstance(config, PretrainedConfig):
            raise ValueError(
                "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. "
...
@@ -385,7 +385,7 @@ class TFConv1D(tf.keras.layers.Layer):
        """ TFConv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2)
            Basically works like a Linear layer but the weights are transposed
        """
-        super(TFConv1D, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.nf = nf
        self.nx = nx
        self.initializer_range = initializer_range
...
@@ -412,7 +412,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
    """

    def __init__(self, vocab_size, hidden_size, initializer_range=None, **kwargs):
-        super(TFSharedEmbeddings, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range
...
@@ -425,7 +425,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
        self.weight = self.add_weight(
            "weight", shape=[self.vocab_size, self.hidden_size], initializer=get_initializer(self.initializer_range)
        )
-        super(TFSharedEmbeddings, self).build(input_shape)
+        super().build(input_shape)

    def call(self, inputs, mode="embedding"):
        """Get token embeddings of inputs.
...
@@ -485,7 +485,7 @@ class TFSequenceSummary(tf.keras.layers.Layer):
    """

    def __init__(self, config, initializer_range=0.02, **kwargs):
-        super(TFSequenceSummary, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.summary_type = config.summary_type if hasattr(config, "summary_use_proj") else "last"
        if self.summary_type == "attn":
...
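Several LM heads in this commit call their input embedding layer with `mode="linear"`, and `TFSharedEmbeddings` above is the layer that supports it: one weight matrix maps ids to vectors in embedding mode and maps hidden states to vocabulary logits in linear mode (weight tying). A stripped-down sketch of that two-mode idea follows; it is illustrative only, not the library class itself.

import tensorflow as tf


class TiedEmbeddings(tf.keras.layers.Layer):
    """Sketch of a shared input/output embedding: one matrix, two modes."""

    def __init__(self, vocab_size, hidden_size, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size

    def build(self, input_shape):
        self.weight = self.add_weight("weight", shape=[self.vocab_size, self.hidden_size])
        super().build(input_shape)

    def call(self, inputs, mode="embedding"):
        if mode == "embedding":
            return tf.gather(self.weight, inputs)                     # ids -> vectors
        if mode == "linear":
            return tf.matmul(inputs, self.weight, transpose_b=True)   # hidden states -> logits
        raise ValueError("mode {} is not valid.".format(mode))


emb = TiedEmbeddings(vocab_size=100, hidden_size=16)
vectors = emb(tf.constant([[1, 2, 3]]))   # (1, 3, 16)
logits = emb(vectors, mode="linear")      # (1, 3, 100)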
src/transformers/modeling_tf_xlm.py
View file @ dc17f2a1
...
@@ -97,7 +97,7 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
    NEW_ID = itertools.count()

    def __init__(self, n_heads, dim, config, **kwargs):
-        super(TFMultiHeadAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.layer_id = next(TFMultiHeadAttention.NEW_ID)
        self.output_attentions = config.output_attentions
        self.dim = dim
...
@@ -182,7 +182,7 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
class TFTransformerFFN(tf.keras.layers.Layer):
    def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs):
-        super(TFTransformerFFN, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.lin1 = tf.keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name="lin1")
        self.lin2 = tf.keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name="lin2")
        self.act = tf.keras.layers.Activation(gelu) if config.gelu_activation else tf.keras.activations.relu
...
@@ -198,7 +198,7 @@ class TFTransformerFFN(tf.keras.layers.Layer):
class TFXLMMainLayer(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
-        super(TFXLMMainLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
...
@@ -608,7 +608,7 @@ class TFXLMModel(TFXLMPreTrainedModel):
    """

    def __init__(self, config, *inputs, **kwargs):
-        super(TFXLMModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFXLMMainLayer(config, name="transformer")

    def call(self, inputs, **kwargs):
...
@@ -622,7 +622,7 @@ class TFXLMPredLayer(tf.keras.layers.Layer):
    """

    def __init__(self, config, input_embeddings, **kwargs):
-        super(TFXLMPredLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
        self.asm = config.asm
        self.n_words = config.n_words
        self.pad_index = config.pad_index
...
@@ -641,7 +641,7 @@ class TFXLMPredLayer(tf.keras.layers.Layer):
    def build(self, input_shape):
        # The output weights are the same as the input embeddings, but there is an output-only bias for each token.
        self.bias = self.add_weight(shape=(self.n_words,), initializer="zeros", trainable=True, name="bias")
-        super(TFXLMPredLayer, self).build(input_shape)
+        super().build(input_shape)

    def call(self, hidden_states):
        hidden_states = self.input_embeddings(hidden_states, mode="linear")
...
@@ -682,7 +682,7 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
    """

    def __init__(self, config, *inputs, **kwargs):
-        super(TFXLMWithLMHeadModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFXLMMainLayer(config, name="transformer")
        self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name="pred_layer_._proj")
...
@@ -733,7 +733,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel):
    """

    def __init__(self, config, *inputs, **kwargs):
-        super(TFXLMForSequenceClassification, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.transformer = TFXLMMainLayer(config, name="transformer")
...
@@ -784,7 +784,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel):
    """

    def __init__(self, config, *inputs, **kwargs):
-        super(TFXLMForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFXLMMainLayer(config, name="transformer")
        self.qa_outputs = tf.keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs"
...
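`TFXLMForQuestionAnsweringSimple` above ends by creating a `qa_outputs` Dense layer; the hunk stops before showing how its output is used. Below is a short sketch of the usual span-extraction pattern, where the shapes and the split into start/end logits are the standard approach and are assumed rather than shown in the diff.

import tensorflow as tf

batch, seq_len, hidden = 4, 128, 768
sequence_output = tf.random.normal((batch, seq_len, hidden))

qa_outputs = tf.keras.layers.Dense(2, name="qa_outputs")    # 2 units: start and end
logits = qa_outputs(sequence_output)                        # (batch, seq_len, 2)
start_logits, end_logits = tf.split(logits, 2, axis=-1)
start_logits = tf.squeeze(start_logits, axis=-1)            # (batch, seq_len)
end_logits = tf.squeeze(end_logits, axis=-1)                # (batch, seq_len)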
src/transformers/modeling_tf_xlnet.py
View file @
dc17f2a1
...
@@ -57,7 +57,7 @@ ACT2FN = {
...
@@ -57,7 +57,7 @@ ACT2FN = {
class
TFXLNetRelativeAttention
(
tf
.
keras
.
layers
.
Layer
):
class
TFXLNetRelativeAttention
(
tf
.
keras
.
layers
.
Layer
):
def
__init__
(
self
,
config
,
**
kwargs
):
def
__init__
(
self
,
config
,
**
kwargs
):
super
(
TFXLNetRelativeAttention
,
self
).
__init__
(
**
kwargs
)
super
().
__init__
(
**
kwargs
)
self
.
output_attentions
=
config
.
output_attentions
self
.
output_attentions
=
config
.
output_attentions
if
config
.
d_model
%
config
.
n_head
!=
0
:
if
config
.
d_model
%
config
.
n_head
!=
0
:
...
@@ -104,7 +104,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
...
@@ -104,7 +104,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
self
.
seg_embed
=
self
.
add_weight
(
self
.
seg_embed
=
self
.
add_weight
(
shape
=
(
2
,
self
.
n_head
,
self
.
d_head
),
initializer
=
initializer
,
trainable
=
True
,
name
=
"seg_embed"
shape
=
(
2
,
self
.
n_head
,
self
.
d_head
),
initializer
=
initializer
,
trainable
=
True
,
name
=
"seg_embed"
)
)
super
(
TFXLNetRelativeAttention
,
self
).
build
(
input_shape
)
super
().
build
(
input_shape
)
def
prune_heads
(
self
,
heads
):
def
prune_heads
(
self
,
heads
):
raise
NotImplementedError
raise
NotImplementedError
...
@@ -280,7 +280,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
...
@@ -280,7 +280,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
class
TFXLNetFeedForward
(
tf
.
keras
.
layers
.
Layer
):
class
TFXLNetFeedForward
(
tf
.
keras
.
layers
.
Layer
):
def
__init__
(
self
,
config
,
**
kwargs
):
def
__init__
(
self
,
config
,
**
kwargs
):
super
(
TFXLNetFeedForward
,
self
).
__init__
(
**
kwargs
)
super
().
__init__
(
**
kwargs
)
self
.
layer_norm
=
tf
.
keras
.
layers
.
LayerNormalization
(
epsilon
=
config
.
layer_norm_eps
,
name
=
"layer_norm"
)
self
.
layer_norm
=
tf
.
keras
.
layers
.
LayerNormalization
(
epsilon
=
config
.
layer_norm_eps
,
name
=
"layer_norm"
)
self
.
layer_1
=
tf
.
keras
.
layers
.
Dense
(
self
.
layer_1
=
tf
.
keras
.
layers
.
Dense
(
config
.
d_inner
,
kernel_initializer
=
get_initializer
(
config
.
initializer_range
),
name
=
"layer_1"
config
.
d_inner
,
kernel_initializer
=
get_initializer
(
config
.
initializer_range
),
name
=
"layer_1"
...
@@ -307,7 +307,7 @@ class TFXLNetFeedForward(tf.keras.layers.Layer):
...
@@ -307,7 +307,7 @@ class TFXLNetFeedForward(tf.keras.layers.Layer):
class
TFXLNetLayer
(
tf
.
keras
.
layers
.
Layer
):
class
TFXLNetLayer
(
tf
.
keras
.
layers
.
Layer
):
def
__init__
(
self
,
config
,
**
kwargs
):
def
__init__
(
self
,
config
,
**
kwargs
):
super
(
TFXLNetLayer
,
self
).
__init__
(
**
kwargs
)
super
().
__init__
(
**
kwargs
)
self
.
rel_attn
=
TFXLNetRelativeAttention
(
config
,
name
=
"rel_attn"
)
self
.
rel_attn
=
TFXLNetRelativeAttention
(
config
,
name
=
"rel_attn"
)
self
.
ff
=
TFXLNetFeedForward
(
config
,
name
=
"ff"
)
self
.
ff
=
TFXLNetFeedForward
(
config
,
name
=
"ff"
)
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
dropout
)
self
.
dropout
=
tf
.
keras
.
layers
.
Dropout
(
config
.
dropout
)
...
@@ -326,7 +326,7 @@ class TFXLNetLayer(tf.keras.layers.Layer):
...
@@ -326,7 +326,7 @@ class TFXLNetLayer(tf.keras.layers.Layer):
class
TFXLNetLMHead
(
tf
.
keras
.
layers
.
Layer
):
class
TFXLNetLMHead
(
tf
.
keras
.
layers
.
Layer
):
def
__init__
(
self
,
config
,
input_embeddings
,
**
kwargs
):
def
__init__
(
self
,
config
,
input_embeddings
,
**
kwargs
):
super
(
TFXLNetLMHead
,
self
).
__init__
(
**
kwargs
)
super
().
__init__
(
**
kwargs
)
self
.
vocab_size
=
config
.
vocab_size
self
.
vocab_size
=
config
.
vocab_size
# The output weights are the same as the input embeddings, but there is
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
# an output-only bias for each token.
...
@@ -334,7 +334,7 @@ class TFXLNetLMHead(tf.keras.layers.Layer):
...
@@ -334,7 +334,7 @@ class TFXLNetLMHead(tf.keras.layers.Layer):
def build(self, input_shape):
def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
super(TFXLNetLMHead, self).build(input_shape)
super().build(input_shape)
def call(self, hidden_states):
def call(self, hidden_states):
hidden_states = self.input_embeddings(hidden_states, mode="linear")
hidden_states = self.input_embeddings(hidden_states, mode="linear")
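The comment in the hunk above ("the output weights are the same as the input embeddings, but there is an output-only bias for each token") describes weight tying. A rough sketch of that idea with hypothetical names; it assumes the shared embedding layer accepts a mode="linear" call that multiplies hidden states by the transposed embedding matrix, as the layer used in this file does:

import tensorflow as tf

class TiedLMHead(tf.keras.layers.Layer):  # illustrative only, not the library class
    def __init__(self, input_embeddings, vocab_size, **kwargs):
        super().__init__(**kwargs)
        self.input_embeddings = input_embeddings  # shared embedding layer
        self.vocab_size = vocab_size

    def build(self, input_shape):
        # the bias is the only new trainable variable; the projection reuses the embedding matrix
        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
        super().build(input_shape)

    def call(self, hidden_states):
        logits = self.input_embeddings(hidden_states, mode="linear")  # assumed shared-embedding API
        return logits + self.bias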
...
@@ -344,7 +344,7 @@ class TFXLNetLMHead(tf.keras.layers.Layer):
...
@@ -344,7 +344,7 @@ class TFXLNetLMHead(tf.keras.layers.Layer):
class TFXLNetMainLayer(tf.keras.layers.Layer):
class TFXLNetMainLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
def __init__(self, config, **kwargs):
super(TFXLNetMainLayer, self).__init__(**kwargs)
super().__init__(**kwargs)
self.output_attentions = config.output_attentions
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.output_hidden_states = config.output_hidden_states
self.output_past = config.output_past
self.output_past = config.output_past
...
@@ -832,7 +832,7 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
...
@@ -832,7 +832,7 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
"""
"""
def __init__(self, config, *inputs, **kwargs):
def __init__(self, config, *inputs, **kwargs):
super(TFXLNetModel, self).__init__(config, *inputs, **kwargs)
super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLNetMainLayer(config, name="transformer")
self.transformer = TFXLNetMainLayer(config, name="transformer")
def call(self, inputs, **kwargs):
def call(self, inputs, **kwargs):
...
@@ -885,7 +885,7 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
...
@@ -885,7 +885,7 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
"""
"""
def __init__(self, config, *inputs, **kwargs):
def __init__(self, config, *inputs, **kwargs):
super(TFXLNetLMHeadModel, self).__init__(config, *inputs, **kwargs)
super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLNetMainLayer(config, name="transformer")
self.transformer = TFXLNetMainLayer(config, name="transformer")
self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name="lm_loss")
self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name="lm_loss")
...
@@ -940,7 +940,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
...
@@ -940,7 +940,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
"""
"""
def __init__(self, config, *inputs, **kwargs):
def __init__(self, config, *inputs, **kwargs):
super(TFXLNetForSequenceClassification, self).__init__(config, *inputs, **kwargs)
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.num_labels = config.num_labels
self.transformer = TFXLNetMainLayer(config, name="transformer")
self.transformer = TFXLNetMainLayer(config, name="transformer")
...
@@ -1001,7 +1001,7 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel):
...
@@ -1001,7 +1001,7 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel):
"""
"""
def __init__(self, config, *inputs, **kwargs):
def __init__(self, config, *inputs, **kwargs):
super(TFXLNetForTokenClassification, self).__init__(config, *inputs, **kwargs)
super().__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.num_labels = config.num_labels
self.transformer = TFXLNetMainLayer(config, name="transformer")
self.transformer = TFXLNetMainLayer(config, name="transformer")
...
@@ -1058,7 +1058,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
...
@@ -1058,7 +1058,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
"""
"""
def __init__(self, config, *inputs, **kwargs):
def __init__(self, config, *inputs, **kwargs):
super(TFXLNetForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs)
super().__init__(config, *inputs, **kwargs)
self.transformer = TFXLNetMainLayer(config, name="transformer")
self.transformer = TFXLNetMainLayer(config, name="transformer")
self.qa_outputs = tf.keras.layers.Dense(
self.qa_outputs = tf.keras.layers.Dense(
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
...
@@ -1127,7 +1127,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
...
@@ -1127,7 +1127,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
# """
# """
# def __init__(self, config, *inputs, **kwargs):
# def __init__(self, config, *inputs, **kwargs):
# super(TFXLNetForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
# super().__init__(config, *inputs, **kwargs)
# self.start_n_top = config.start_n_top
# self.start_n_top = config.start_n_top
# self.end_n_top = config.end_n_top
# self.end_n_top = config.end_n_top
...
...
src/transformers/modeling_transfo_xl.py
View file @
dc17f2a1
...
@@ -165,7 +165,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
...
@@ -165,7 +165,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
class PositionalEmbedding(nn.Module):
class PositionalEmbedding(nn.Module):
def __init__(self, demb):
def __init__(self, demb):
super(PositionalEmbedding, self).__init__()
super().__init__()
self.demb = demb
self.demb = demb
...
@@ -184,7 +184,7 @@ class PositionalEmbedding(nn.Module):
...
@@ -184,7 +184,7 @@ class PositionalEmbedding(nn.Module):
class PositionwiseFF(nn.Module):
class PositionwiseFF(nn.Module):
def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5):
def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5):
super(PositionwiseFF, self).__init__()
super().__init__()
self.d_model = d_model
self.d_model = d_model
self.d_inner = d_inner
self.d_inner = d_inner
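For the PyTorch modules in this file the change is identical; only the super() call differs. A condensed sketch of a position-wise feed-forward block in the new style, loosely modeled on the PositionwiseFF above (post-norm variant only; the real class also supports pre_lnorm):

import torch.nn as nn

class PositionwiseFFSketch(nn.Module):  # hypothetical, condensed illustration
    def __init__(self, d_model, d_inner, dropout):
        super().__init__()  # zero-argument Python 3 form adopted in this commit
        self.net = nn.Sequential(
            nn.Linear(d_model, d_inner), nn.ReLU(inplace=True), nn.Dropout(dropout),
            nn.Linear(d_inner, d_model), nn.Dropout(dropout),
        )
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, x):
        return self.layer_norm(x + self.net(x))  # residual connection followed by layer norm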
...
@@ -236,7 +236,7 @@ class RelPartialLearnableMultiHeadAttn(nn.Module):
...
@@ -236,7 +236,7 @@ class RelPartialLearnableMultiHeadAttn(nn.Module):
output_attentions=False,
output_attentions=False,
layer_norm_epsilon=1e-5,
layer_norm_epsilon=1e-5,
):
):
super(RelPartialLearnableMultiHeadAttn, self).__init__()
super().__init__()
self.output_attentions = output_attentions
self.output_attentions = output_attentions
self.n_head = n_head
self.n_head = n_head
...
@@ -368,7 +368,7 @@ class RelPartialLearnableMultiHeadAttn(nn.Module):
...
@@ -368,7 +368,7 @@ class RelPartialLearnableMultiHeadAttn(nn.Module):
class RelPartialLearnableDecoderLayer(nn.Module):
class RelPartialLearnableDecoderLayer(nn.Module):
def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon=1e-5, **kwargs):
def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon=1e-5, **kwargs):
super(RelPartialLearnableDecoderLayer, self).__init__()
super().__init__()
self.dec_attn = RelPartialLearnableMultiHeadAttn(
self.dec_attn = RelPartialLearnableMultiHeadAttn(
n_head, d_model, d_head, dropout, layer_norm_epsilon=layer_norm_epsilon, **kwargs
n_head, d_model, d_head, dropout, layer_norm_epsilon=layer_norm_epsilon, **kwargs
...
@@ -389,7 +389,7 @@ class RelPartialLearnableDecoderLayer(nn.Module):
...
@@ -389,7 +389,7 @@ class RelPartialLearnableDecoderLayer(nn.Module):
class AdaptiveEmbedding(nn.Module):
class AdaptiveEmbedding(nn.Module):
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, sample_softmax=False):
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, sample_softmax=False):
super(AdaptiveEmbedding, self).__init__()
super().__init__()
self.n_token = n_token
self.n_token = n_token
self.d_embed = d_embed
self.d_embed = d_embed
...
@@ -587,7 +587,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
...
@@ -587,7 +587,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
"""
"""
def __init__(self, config):
def __init__(self, config):
super(TransfoXLModel, self).__init__(config)
super().__init__(config)
self.output_attentions = config.output_attentions
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.output_hidden_states = config.output_hidden_states
...
@@ -845,7 +845,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
...
@@ -845,7 +845,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
"""
"""
def __init__(self, config):
def __init__(self, config):
super(TransfoXLLMHeadModel, self).__init__(config)
super().__init__(config)
self.transformer = TransfoXLModel(config)
self.transformer = TransfoXLModel(config)
self.sample_softmax = config.sample_softmax
self.sample_softmax = config.sample_softmax
# use sampled softmax
# use sampled softmax
...
...
src/transformers/modeling_transfo_xl_utilities.py
View file @
dc17f2a1
...
@@ -29,7 +29,7 @@ import torch.nn.functional as F
...
@@ -29,7 +29,7 @@ import torch.nn.functional as F
class ProjectedAdaptiveLogSoftmax(nn.Module):
class ProjectedAdaptiveLogSoftmax(nn.Module):
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, keep_order=False):
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, keep_order=False):
super(ProjectedAdaptiveLogSoftmax, self).__init__()
super().__init__()
self.n_token = n_token
self.n_token = n_token
self.d_embed = d_embed
self.d_embed = d_embed
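ProjectedAdaptiveLogSoftmax implements an adaptive softmax: the cutoffs argument splits the vocabulary into frequency-ordered clusters so that rare tokens get cheaper, lower-dimensional projections. A hedged usage sketch of the same idea with PyTorch's built-in nn.AdaptiveLogSoftmaxWithLoss (the library class additionally projects d_proj to d_embed per cluster); the vocabulary size and cutoffs below are illustrative, loosely based on the WikiText-103 setup:

import torch
import torch.nn as nn

adaptive = nn.AdaptiveLogSoftmaxWithLoss(
    in_features=512, n_classes=267735, cutoffs=[20000, 40000, 200000], div_value=4.0
)
hidden = torch.randn(8, 512)               # a batch of hidden states
targets = torch.randint(0, 267735, (8,))   # token ids to score
out = adaptive(hidden, targets)            # out.loss is the mean negative log-likelihood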
...
...