chenpangpang / transformers · Commits · dc17f2a1

Commit dc17f2a1 (unverified), authored Jan 16, 2020 by Thomas Wolf, committed by GitHub on Jan 16, 2020

Merge pull request #2538 from huggingface/py3_super

💄 super

Parents: 88085484 a98b2ca8
Showing 20 changed files with 177 additions and 179 deletions (+177 -179).
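The whole commit is one mechanical Python 3 cleanup: every Python 2-compatible call of the form super(ClassName, self).__init__(...) (and the matching .build(...) and .forward(...) calls) is replaced by the zero-argument super().__init__(...) spelling. The snippet below is only an illustration of that pattern with made-up class names, not code taken from the repository:

# Hypothetical classes, used only to illustrate the refactor applied throughout this diff.
class Base:
    def __init__(self, config):
        self.config = config

class Child(Base):
    def __init__(self, config):
        # Old, Python 2-compatible spelling removed by this commit:
        #     super(Child, self).__init__(config)
        # New, Python 3 zero-argument spelling added in its place:
        super().__init__(config)

c = Child({"hidden_size": 8})
assert c.config["hidden_size"] == 8  # behaviour is unchanged, only the spelling differs

Both spellings resolve to the same method-resolution-order lookup; the zero-argument form simply lets Python fill in the class and instance, which is why the change is purely cosmetic (hence the 💄 gitmoji).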
Changed files:

src/transformers/modeling_gpt2.py                     +7   -7
src/transformers/modeling_mmbt.py                     +3   -3
src/transformers/modeling_openai.py                   +6   -6
src/transformers/modeling_roberta.py                  +10  -10
src/transformers/modeling_t5.py                       +10  -10
src/transformers/modeling_tf_albert.py                +13  -13
src/transformers/modeling_tf_bert.py                  +24  -24
src/transformers/modeling_tf_ctrl.py                  +7   -7
src/transformers/modeling_tf_distilbert.py            +14  -14
src/transformers/modeling_tf_gpt2.py                  +7   -7
src/transformers/modeling_tf_openai.py                +7   -7
src/transformers/modeling_tf_roberta.py               +10  -12
src/transformers/modeling_tf_t5.py                    +11  -11
src/transformers/modeling_tf_transfo_xl.py            +11  -11
src/transformers/modeling_tf_transfo_xl_utilities.py  +2   -2
src/transformers/modeling_tf_utils.py                 +5   -5
src/transformers/modeling_tf_xlm.py                   +9   -9
src/transformers/modeling_tf_xlnet.py                 +13  -13
src/transformers/modeling_transfo_xl.py               +7   -7
src/transformers/modeling_transfo_xl_utilities.py     +1   -1
src/transformers/modeling_gpt2.py  (view file @ dc17f2a1)

@@ -101,7 +101,7 @@ def gelu(x):
 class Attention(nn.Module):
     def __init__(self, nx, n_ctx, config, scale=False):
-        super(Attention, self).__init__()
+        super().__init__()
         self.output_attentions = config.output_attentions

         n_state = nx  # in Attention: n_state=768 (nx=n_embd)

@@ -202,7 +202,7 @@ class Attention(nn.Module):
 class MLP(nn.Module):
     def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
-        super(MLP, self).__init__()
+        super().__init__()
         nx = config.n_embd
         self.c_fc = Conv1D(n_state, nx)
         self.c_proj = Conv1D(nx, n_state)

@@ -217,7 +217,7 @@ class MLP(nn.Module):
 class Block(nn.Module):
     def __init__(self, n_ctx, config, scale=False):
-        super(Block, self).__init__()
+        super().__init__()
         nx = config.n_embd
         self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
         self.attn = Attention(nx, n_ctx, config, scale)

@@ -249,7 +249,7 @@ class GPT2PreTrainedModel(PreTrainedModel):
     base_model_prefix = "transformer"

     def __init__(self, *inputs, **kwargs):
-        super(GPT2PreTrainedModel, self).__init__(*inputs, **kwargs)
+        super().__init__(*inputs, **kwargs)

     def _init_weights(self, module):
         """ Initialize the weights.

@@ -355,7 +355,7 @@ class GPT2Model(GPT2PreTrainedModel):
     """

     def __init__(self, config):
-        super(GPT2Model, self).__init__(config)
+        super().__init__(config)
         self.output_hidden_states = config.output_hidden_states
         self.output_attentions = config.output_attentions
         self.output_past = config.output_past

@@ -550,7 +550,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
     """

     def __init__(self, config):
-        super(GPT2LMHeadModel, self).__init__(config)
+        super().__init__(config)
         self.transformer = GPT2Model(config)
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

@@ -678,7 +678,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
     """

     def __init__(self, config):
-        super(GPT2DoubleHeadsModel, self).__init__(config)
+        super().__init__(config)
         config.num_labels = 1
         self.transformer = GPT2Model(config)
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
src/transformers/modeling_mmbt.py  (view file @ dc17f2a1)

@@ -33,7 +33,7 @@ class ModalEmbeddings(nn.Module):
     """

     def __init__(self, config, encoder, embeddings):
-        super(ModalEmbeddings, self).__init__()
+        super().__init__()
         self.config = config
         self.encoder = encoder
         self.proj_embeddings = nn.Linear(config.modal_hidden_size, config.hidden_size)

@@ -175,7 +175,7 @@ class MMBTModel(nn.Module):
     """

     def __init__(self, config, transformer, encoder):
-        super(MMBTModel, self).__init__()
+        super().__init__()
         self.config = config
         self.transformer = transformer
         self.modal_encoder = ModalEmbeddings(config, encoder, transformer.embeddings)

@@ -359,7 +359,7 @@ class MMBTForClassification(nn.Module):
     """

     def __init__(self, config, transformer, encoder):
-        super(MMBTForClassification, self).__init__()
+        super().__init__()
         self.num_labels = config.num_labels
         self.mmbt = MMBTModel(config, transformer, encoder)
src/transformers/modeling_openai.py  (view file @ dc17f2a1)

@@ -127,7 +127,7 @@ ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu}
 class Attention(nn.Module):
     def __init__(self, nx, n_ctx, config, scale=False):
-        super(Attention, self).__init__()
+        super().__init__()
         n_state = nx  # in Attention: n_state=768 (nx=n_embd)
         # [switch nx => n_state from Block to Attention to keep identical to TF implem]
         assert n_state % config.n_head == 0

@@ -221,7 +221,7 @@ class Attention(nn.Module):
 class MLP(nn.Module):
     def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
-        super(MLP, self).__init__()
+        super().__init__()
         nx = config.n_embd
         self.c_fc = Conv1D(n_state, nx)
         self.c_proj = Conv1D(nx, n_state)

@@ -236,7 +236,7 @@ class MLP(nn.Module):
 class Block(nn.Module):
     def __init__(self, n_ctx, config, scale=False):
-        super(Block, self).__init__()
+        super().__init__()
         nx = config.n_embd
         self.attn = Attention(nx, n_ctx, config, scale)
         self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)

@@ -359,7 +359,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
     """

     def __init__(self, config):
-        super(OpenAIGPTModel, self).__init__(config)
+        super().__init__(config)
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states

@@ -518,7 +518,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
     """

     def __init__(self, config):
-        super(OpenAIGPTLMHeadModel, self).__init__(config)
+        super().__init__(config)
         self.transformer = OpenAIGPTModel(config)
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

@@ -623,7 +623,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
     """

     def __init__(self, config):
-        super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
+        super().__init__(config)
         config.num_labels = 1
         self.transformer = OpenAIGPTModel(config)
src/transformers/modeling_roberta.py  (view file @ dc17f2a1)

@@ -45,7 +45,7 @@ class RobertaEmbeddings(BertEmbeddings):
     """

     def __init__(self, config):
-        super(RobertaEmbeddings, self).__init__(config)
+        super().__init__(config)
         self.padding_idx = 1
         self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
         self.position_embeddings = nn.Embedding(

@@ -60,7 +60,7 @@ class RobertaEmbeddings(BertEmbeddings):
         else:
             position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

-        return super(RobertaEmbeddings, self).forward(
+        return super().forward(
             input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds
         )

@@ -204,7 +204,7 @@ class RobertaModel(BertModel):
     base_model_prefix = "roberta"

     def __init__(self, config):
-        super(RobertaModel, self).__init__(config)
+        super().__init__(config)
         self.embeddings = RobertaEmbeddings(config)
         self.init_weights()

@@ -254,7 +254,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
     base_model_prefix = "roberta"

     def __init__(self, config):
-        super(RobertaForMaskedLM, self).__init__(config)
+        super().__init__(config)
         self.roberta = RobertaModel(config)
         self.lm_head = RobertaLMHead(config)

@@ -299,7 +299,7 @@ class RobertaLMHead(nn.Module):
     """Roberta Head for masked language modeling."""

     def __init__(self, config):
-        super(RobertaLMHead, self).__init__()
+        super().__init__()
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
         self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)

@@ -362,7 +362,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
     base_model_prefix = "roberta"

     def __init__(self, config):
-        super(RobertaForSequenceClassification, self).__init__(config)
+        super().__init__(config)
         self.num_labels = config.num_labels
         self.roberta = RobertaModel(config)

@@ -484,7 +484,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
     base_model_prefix = "roberta"

     def __init__(self, config):
-        super(RobertaForMultipleChoice, self).__init__(config)
+        super().__init__(config)
         self.roberta = RobertaModel(config)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)

@@ -571,7 +571,7 @@ class RobertaForTokenClassification(BertPreTrainedModel):
     base_model_prefix = "roberta"

     def __init__(self, config):
-        super(RobertaForTokenClassification, self).__init__(config)
+        super().__init__(config)
         self.num_labels = config.num_labels
         self.roberta = RobertaModel(config)

@@ -625,7 +625,7 @@ class RobertaClassificationHead(nn.Module):
     """Head for sentence-level classification tasks."""

     def __init__(self, config):
-        super(RobertaClassificationHead, self).__init__()
+        super().__init__()
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

@@ -684,7 +684,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel):
     base_model_prefix = "roberta"

     def __init__(self, config):
-        super(RobertaForQuestionAnswering, self).__init__(config)
+        super().__init__(config)
         self.num_labels = config.num_labels
         self.roberta = RobertaModel(config)
src/transformers/modeling_t5.py  (view file @ dc17f2a1)

@@ -142,7 +142,7 @@ class T5LayerNorm(nn.Module):
         """ Construct a layernorm module in the T5 style
             No bias and no substraction of mean.
         """
-        super(T5LayerNorm, self).__init__()
+        super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
         self.variance_epsilon = eps

@@ -154,7 +154,7 @@ class T5LayerNorm(nn.Module):
 class T5DenseReluDense(nn.Module):
     def __init__(self, config):
-        super(T5DenseReluDense, self).__init__()
+        super().__init__()
         self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
         self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
         self.dropout = nn.Dropout(config.dropout_rate)

@@ -169,7 +169,7 @@ class T5DenseReluDense(nn.Module):
 class T5LayerFF(nn.Module):
     def __init__(self, config):
-        super(T5LayerFF, self).__init__()
+        super().__init__()
         self.DenseReluDense = T5DenseReluDense(config)
         self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
         self.dropout = nn.Dropout(config.dropout_rate)

@@ -185,7 +185,7 @@ class T5Attention(nn.Module):
     NEW_ID = itertools.count()

     def __init__(self, config, has_relative_attention_bias=False):
-        super(T5Attention, self).__init__()
+        super().__init__()
         self.layer_id = next(T5Attention.NEW_ID)
         self.is_decoder = config.is_decoder
         self.has_relative_attention_bias = has_relative_attention_bias

@@ -363,7 +363,7 @@ class T5Attention(nn.Module):
 class T5LayerSelfAttention(nn.Module):
     def __init__(self, config, has_relative_attention_bias=False):
-        super(T5LayerSelfAttention, self).__init__()
+        super().__init__()
         self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
         self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
         self.dropout = nn.Dropout(config.dropout_rate)

@@ -381,7 +381,7 @@ class T5LayerSelfAttention(nn.Module):
 class T5LayerCrossAttention(nn.Module):
     def __init__(self, config, has_relative_attention_bias=False):
-        super(T5LayerCrossAttention, self).__init__()
+        super().__init__()
         self.EncDecAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
         self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
         self.dropout = nn.Dropout(config.dropout_rate)

@@ -399,7 +399,7 @@ class T5LayerCrossAttention(nn.Module):
 class T5Block(nn.Module):
     def __init__(self, config, has_relative_attention_bias=False):
-        super(T5Block, self).__init__()
+        super().__init__()
         self.is_decoder = config.is_decoder
         self.layer = nn.ModuleList()
         self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias))

@@ -501,7 +501,7 @@ class T5PreTrainedModel(PreTrainedModel):
 class T5Stack(T5PreTrainedModel):
     def __init__(self, config):
-        super(T5Stack, self).__init__(config)
+        super().__init__(config)
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
         self.is_decoder = config.is_decoder

@@ -724,7 +724,7 @@ class T5Model(T5PreTrainedModel):
     """

     def __init__(self, config):
-        super(T5Model, self).__init__(config)
+        super().__init__(config)
         self.shared = nn.Embedding(config.vocab_size, config.d_model)

         encoder_config = copy.deepcopy(config)

@@ -830,7 +830,7 @@ class T5WithLMHeadModel(T5PreTrainedModel):
     """

     def __init__(self, config):
-        super(T5WithLMHeadModel, self).__init__(config)
+        super().__init__(config)
         self.model_dim = config.d_model
         self.shared = nn.Embedding(config.vocab_size, config.d_model)
src/transformers/modeling_tf_albert.py  (view file @ dc17f2a1)

@@ -45,7 +45,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
     """

     def __init__(self, config, **kwargs):
-        super(TFAlbertEmbeddings, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.config = config
         self.position_embeddings = tf.keras.layers.Embedding(

@@ -76,7 +76,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
             shape=[self.config.vocab_size, self.config.embedding_size],
             initializer=get_initializer(self.config.initializer_range),
         )
-        super(TFAlbertEmbeddings, self).build(input_shape)
+        super().build(input_shape)

     def call(self, inputs, mode="embedding", training=False):
         """Get token embeddings of inputs.

@@ -141,7 +141,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer):
 class TFAlbertSelfAttention(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFAlbertSelfAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         if config.hidden_size % config.num_attention_heads != 0:
             raise ValueError(
                 "The hidden size (%d) is not a multiple of the number of attention "

@@ -217,7 +217,7 @@ class TFAlbertSelfAttention(tf.keras.layers.Layer):
 class TFAlbertSelfOutput(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFAlbertSelfOutput, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.dense = tf.keras.layers.Dense(
             config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )

@@ -235,7 +235,7 @@ class TFAlbertSelfOutput(tf.keras.layers.Layer):
 class TFAlbertAttention(TFBertSelfAttention):
     def __init__(self, config, **kwargs):
-        super(TFAlbertAttention, self).__init__(config, **kwargs)
+        super().__init__(config, **kwargs)
         self.hidden_size = config.hidden_size
         self.dense = tf.keras.layers.Dense(

@@ -303,7 +303,7 @@ class TFAlbertAttention(TFBertSelfAttention):
 class TFAlbertLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFAlbertLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.attention = TFAlbertAttention(config, name="attention")
         self.ffn = tf.keras.layers.Dense(

@@ -341,7 +341,7 @@ class TFAlbertLayer(tf.keras.layers.Layer):
 class TFAlbertLayerGroup(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFAlbertLayerGroup, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states

@@ -376,7 +376,7 @@ class TFAlbertLayerGroup(tf.keras.layers.Layer):
 class TFAlbertTransformer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFAlbertTransformer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.config = config
         self.output_attentions = config.output_attentions

@@ -445,7 +445,7 @@ class TFAlbertPreTrainedModel(TFPreTrainedModel):
 class TFAlbertMLMHead(tf.keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
-        super(TFAlbertMLMHead, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = config.vocab_size
         self.dense = tf.keras.layers.Dense(

@@ -467,7 +467,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
         self.decoder_bias = self.add_weight(
             shape=(self.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias"
         )
-        super(TFAlbertMLMHead, self).build(input_shape)
+        super().build(input_shape)

     def call(self, hidden_states):
         hidden_states = self.dense(hidden_states)

@@ -596,7 +596,7 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
     """

     def __init__(self, config, **kwargs):
-        super(TFAlbertModel, self).__init__(config, **kwargs)
+        super().__init__(config, **kwargs)
         self.num_hidden_layers = config.num_hidden_layers
         self.embeddings = TFAlbertEmbeddings(config, name="embeddings")

@@ -733,7 +733,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFAlbertForMaskedLM, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.albert = TFAlbertModel(config, name="albert")
         self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions")

@@ -786,7 +786,7 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFAlbertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
         self.albert = TFAlbertModel(config, name="albert")
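The same rewrite also appears inside the Keras build() overrides of the TF model files (TFAlbertEmbeddings and TFAlbertMLMHead above, and the BERT/DistilBERT/Transfo-XL layers below). A minimal sketch with a hypothetical layer, not taken from the repository:

import tensorflow as tf

class BiasLayer(tf.keras.layers.Layer):  # hypothetical layer, for illustration only
    def build(self, input_shape):
        self.bias = self.add_weight(shape=(1,), initializer="zeros", name="bias")
        # previously written as: super(BiasLayer, self).build(input_shape)
        super().build(input_shape)

    def call(self, inputs):
        return inputs + self.bias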
src/transformers/modeling_tf_bert.py  (view file @ dc17f2a1)

@@ -93,7 +93,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
     """

     def __init__(self, config, **kwargs):
-        super(TFBertEmbeddings, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = config.vocab_size
         self.hidden_size = config.hidden_size
         self.initializer_range = config.initializer_range

@@ -126,7 +126,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
             shape=[self.vocab_size, self.hidden_size],
             initializer=get_initializer(self.initializer_range),
         )
-        super(TFBertEmbeddings, self).build(input_shape)
+        super().build(input_shape)

     def call(self, inputs, mode="embedding", training=False):
         """Get token embeddings of inputs.

@@ -193,7 +193,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
 class TFBertSelfAttention(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertSelfAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         if config.hidden_size % config.num_attention_heads != 0:
             raise ValueError(
                 "The hidden size (%d) is not a multiple of the number of attention "

@@ -269,7 +269,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
 class TFBertSelfOutput(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertSelfOutput, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.dense = tf.keras.layers.Dense(
             config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )

@@ -287,7 +287,7 @@ class TFBertSelfOutput(tf.keras.layers.Layer):
 class TFBertAttention(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.self_attention = TFBertSelfAttention(config, name="self")
         self.dense_output = TFBertSelfOutput(config, name="output")

@@ -305,7 +305,7 @@ class TFBertAttention(tf.keras.layers.Layer):
 class TFBertIntermediate(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertIntermediate, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.dense = tf.keras.layers.Dense(
             config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )

@@ -322,7 +322,7 @@ class TFBertIntermediate(tf.keras.layers.Layer):
 class TFBertOutput(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertOutput, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.dense = tf.keras.layers.Dense(
             config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )

@@ -340,7 +340,7 @@ class TFBertOutput(tf.keras.layers.Layer):
 class TFBertLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.attention = TFBertAttention(config, name="attention")
         self.intermediate = TFBertIntermediate(config, name="intermediate")
         self.bert_output = TFBertOutput(config, name="output")

@@ -358,7 +358,7 @@ class TFBertLayer(tf.keras.layers.Layer):
 class TFBertEncoder(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertEncoder, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
         self.layer = [TFBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)]

@@ -392,7 +392,7 @@ class TFBertEncoder(tf.keras.layers.Layer):
 class TFBertPooler(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertPooler, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.dense = tf.keras.layers.Dense(
             config.hidden_size,
             kernel_initializer=get_initializer(config.initializer_range),

@@ -410,7 +410,7 @@ class TFBertPooler(tf.keras.layers.Layer):
 class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertPredictionHeadTransform, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.dense = tf.keras.layers.Dense(
             config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
         )

@@ -429,7 +429,7 @@ class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
 class TFBertLMPredictionHead(tf.keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
-        super(TFBertLMPredictionHead, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = config.vocab_size
         self.transform = TFBertPredictionHeadTransform(config, name="transform")

@@ -439,7 +439,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
     def build(self, input_shape):
         self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
-        super(TFBertLMPredictionHead, self).build(input_shape)
+        super().build(input_shape)

     def call(self, hidden_states):
         hidden_states = self.transform(hidden_states)

@@ -450,7 +450,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer):
 class TFBertMLMHead(tf.keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
-        super(TFBertMLMHead, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.predictions = TFBertLMPredictionHead(config, input_embeddings, name="predictions")

     def call(self, sequence_output):

@@ -460,7 +460,7 @@ class TFBertMLMHead(tf.keras.layers.Layer):
 class TFBertNSPHead(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertNSPHead, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.seq_relationship = tf.keras.layers.Dense(
             2, kernel_initializer=get_initializer(config.initializer_range), name="seq_relationship"
         )

@@ -472,7 +472,7 @@ class TFBertNSPHead(tf.keras.layers.Layer):
 class TFBertMainLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFBertMainLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.num_hidden_layers = config.num_hidden_layers
         self.embeddings = TFBertEmbeddings(config, name="embeddings")

@@ -707,7 +707,7 @@ class TFBertModel(TFBertPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFBertModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.bert = TFBertMainLayer(config, name="bert")

     def call(self, inputs, **kwargs):

@@ -750,7 +750,7 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFBertForPreTraining, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.bert = TFBertMainLayer(config, name="bert")
         self.nsp = TFBertNSPHead(config, name="nsp___cls")

@@ -803,7 +803,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFBertForMaskedLM, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.bert = TFBertMainLayer(config, name="bert")
         self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls")

@@ -854,7 +854,7 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFBertForNextSentencePrediction, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.bert = TFBertMainLayer(config, name="bert")
         self.nsp = TFBertNSPHead(config, name="nsp___cls")

@@ -903,7 +903,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFBertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
         self.bert = TFBertMainLayer(config, name="bert")

@@ -960,7 +960,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFBertForMultipleChoice, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.bert = TFBertMainLayer(config, name="bert")
         self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)

@@ -1064,7 +1064,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFBertForTokenClassification, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
         self.bert = TFBertMainLayer(config, name="bert")

@@ -1121,7 +1121,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
         self.bert = TFBertMainLayer(config, name="bert")
src/transformers/modeling_tf_ctrl.py  (view file @ dc17f2a1)

@@ -75,7 +75,7 @@ def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=N
 class TFMultiHeadAttention(tf.keras.layers.Layer):
     def __init__(self, d_model_size, num_heads, output_attentions=False, **kwargs):
-        super(TFMultiHeadAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.output_attentions = output_attentions
         self.num_heads = num_heads
         self.d_model_size = d_model_size

@@ -132,7 +132,7 @@ class TFEncoderLayer(tf.keras.layers.Layer):
     def __init__(
         self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs
     ):
-        super(TFEncoderLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.multi_head_attention = TFMultiHeadAttention(
             d_model_size, num_heads, output_attentions, name="multi_head_attention"

@@ -166,7 +166,7 @@ class TFEncoderLayer(tf.keras.layers.Layer):
 class TFCTRLMainLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFCTRLMainLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.output_hidden_states = config.output_hidden_states
         self.output_attentions = config.output_attentions
         self.output_past = config.output_past

@@ -443,7 +443,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFCTRLModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.transformer = TFCTRLMainLayer(config, name="transformer")

     def call(self, inputs, **kwargs):

@@ -453,7 +453,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel):
 class TFCTRLLMHead(tf.keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
-        super(TFCTRLLMHead, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = config.vocab_size
         # The output weights are the same as the input embeddings, but there is

@@ -462,7 +462,7 @@ class TFCTRLLMHead(tf.keras.layers.Layer):
     def build(self, input_shape):
         self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
-        super(TFCTRLLMHead, self).build(input_shape)
+        super().build(input_shape)

     def call(self, hidden_states):
         hidden_states = self.input_embeddings(hidden_states, mode="linear")

@@ -508,7 +508,7 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFCTRLLMHeadModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.transformer = TFCTRLMainLayer(config, name="transformer")
         self.lm_head = TFCTRLLMHead(config, self.transformer.w, name="lm_head")
src/transformers/modeling_tf_distilbert.py  (view file @ dc17f2a1)

@@ -65,7 +65,7 @@ def gelu_new(x):
 class TFEmbeddings(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFEmbeddings, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = config.vocab_size
         self.dim = config.dim
         self.initializer_range = config.initializer_range

@@ -92,7 +92,7 @@ class TFEmbeddings(tf.keras.layers.Layer):
         self.word_embeddings = self.add_weight(
             "weight", shape=[self.vocab_size, self.dim], initializer=get_initializer(self.initializer_range)
         )
-        super(TFEmbeddings, self).build(input_shape)
+        super().build(input_shape)

     def call(self, inputs, inputs_embeds=None, mode="embedding", training=False):
         """Get token embeddings of inputs.

@@ -169,7 +169,7 @@ class TFEmbeddings(tf.keras.layers.Layer):
 class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFMultiHeadSelfAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.n_heads = config.n_heads
         self.dim = config.dim

@@ -259,7 +259,7 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
 class TFFFN(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFFFN, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.dropout = tf.keras.layers.Dropout(config.dropout)
         self.lin1 = tf.keras.layers.Dense(
             config.hidden_dim, kernel_initializer=get_initializer(config.initializer_range), name="lin1"

@@ -284,7 +284,7 @@ class TFFFN(tf.keras.layers.Layer):
 class TFTransformerBlock(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFTransformerBlock, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.n_heads = config.n_heads
         self.dim = config.dim

@@ -338,7 +338,7 @@ class TFTransformerBlock(tf.keras.layers.Layer):
 class TFTransformer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFTransformer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.n_layers = config.n_layers
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states

@@ -399,7 +399,7 @@ class TFTransformer(tf.keras.layers.Layer):
 class TFDistilBertMainLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFDistilBertMainLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.num_hidden_layers = config.num_hidden_layers
         self.embeddings = TFEmbeddings(config, name="embeddings")  # Embeddings

@@ -569,7 +569,7 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFDistilBertModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.distilbert = TFDistilBertMainLayer(config, name="distilbert")  # Embeddings

     def call(self, inputs, **kwargs):

@@ -579,7 +579,7 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel):
 class TFDistilBertLMHead(tf.keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
-        super(TFDistilBertLMHead, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = config.vocab_size
         # The output weights are the same as the input embeddings, but there is

@@ -588,7 +588,7 @@ class TFDistilBertLMHead(tf.keras.layers.Layer):
     def build(self, input_shape):
         self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
-        super(TFDistilBertLMHead, self).build(input_shape)
+        super().build(input_shape)

     def call(self, hidden_states):
         hidden_states = self.input_embeddings(hidden_states, mode="linear")

@@ -628,7 +628,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFDistilBertForMaskedLM, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
         self.vocab_size = config.vocab_size

@@ -690,7 +690,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFDistilBertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
         self.distilbert = TFDistilBertMainLayer(config, name="distilbert")

@@ -747,7 +747,7 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFDistilBertForTokenClassification, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
         self.distilbert = TFDistilBertMainLayer(config, name="distilbert")

@@ -804,7 +804,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFDistilBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.distilbert = TFDistilBertMainLayer(config, name="distilbert")
         self.qa_outputs = tf.keras.layers.Dense(
src/transformers/modeling_tf_gpt2.py  (view file @ dc17f2a1)

@@ -58,7 +58,7 @@ def gelu(x):
 class TFAttention(tf.keras.layers.Layer):
     def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
-        super(TFAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.output_attentions = config.output_attentions

         n_state = nx  # in Attention: n_state=768 (nx=n_embd)

@@ -157,7 +157,7 @@ class TFAttention(tf.keras.layers.Layer):
 class TFMLP(tf.keras.layers.Layer):
     def __init__(self, n_state, config, **kwargs):
-        super(TFMLP, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         nx = config.n_embd
         self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc")
         self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")

@@ -173,7 +173,7 @@ class TFMLP(tf.keras.layers.Layer):
 class TFBlock(tf.keras.layers.Layer):
     def __init__(self, n_ctx, config, scale=False, **kwargs):
-        super(TFBlock, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         nx = config.n_embd
         self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")
         self.attn = TFAttention(nx, n_ctx, config, scale, name="attn")

@@ -198,7 +198,7 @@ class TFBlock(tf.keras.layers.Layer):
 class TFGPT2MainLayer(tf.keras.layers.Layer):
     def __init__(self, config, *inputs, **kwargs):
-        super(TFGPT2MainLayer, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.output_hidden_states = config.output_hidden_states
         self.output_attentions = config.output_attentions
         self.num_hidden_layers = config.n_layer

@@ -475,7 +475,7 @@ class TFGPT2Model(TFGPT2PreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFGPT2Model, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.transformer = TFGPT2MainLayer(config, name="transformer")

     def call(self, inputs, **kwargs):

@@ -521,7 +521,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFGPT2LMHeadModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.transformer = TFGPT2MainLayer(config, name="transformer")

     def get_output_embeddings(self):

@@ -598,7 +598,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         config.num_labels = 1
         self.transformer = TFGPT2MainLayer(config, name="transformer")
         self.multiple_choice_head = TFSequenceSummary(
src/transformers/modeling_tf_openai.py  (view file @ dc17f2a1)

@@ -66,7 +66,7 @@ ACT_FNS = {
 class TFAttention(tf.keras.layers.Layer):
     def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
-        super(TFAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.output_attentions = config.output_attentions

         n_state = nx  # in Attention: n_state=768 (nx=n_embd)

@@ -160,7 +160,7 @@ class TFAttention(tf.keras.layers.Layer):
 class TFMLP(tf.keras.layers.Layer):
     def __init__(self, n_state, config, **kwargs):
-        super(TFMLP, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         nx = config.n_embd
         self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc")
         self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")

@@ -176,7 +176,7 @@ class TFMLP(tf.keras.layers.Layer):
 class TFBlock(tf.keras.layers.Layer):
     def __init__(self, n_ctx, config, scale=False, **kwargs):
-        super(TFBlock, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         nx = config.n_embd
         self.attn = TFAttention(nx, n_ctx, config, scale, name="attn")
         self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")

@@ -199,7 +199,7 @@ class TFBlock(tf.keras.layers.Layer):
 class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
     def __init__(self, config, *inputs, **kwargs):
-        super(TFOpenAIGPTMainLayer, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.output_hidden_states = config.output_hidden_states
         self.output_attentions = config.output_attentions
         self.num_hidden_layers = config.n_layer

@@ -453,7 +453,7 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFOpenAIGPTModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")

     def call(self, inputs, **kwargs):

@@ -494,7 +494,7 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFOpenAIGPTLMHeadModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")

     def get_output_embeddings(self):

@@ -563,7 +563,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFOpenAIGPTDoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         config.num_labels = 1
         self.transformer = TFOpenAIGPTMainLayer(config, name="transformer")
         self.multiple_choice_head = TFSequenceSummary(
src/transformers/modeling_tf_roberta.py  (view file @ dc17f2a1)

@@ -42,7 +42,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings):
     """

     def __init__(self, config, **kwargs):
-        super(TFRobertaEmbeddings, self).__init__(config, **kwargs)
+        super().__init__(config, **kwargs)
         self.padding_idx = 1

     def create_position_ids_from_input_ids(self, x):

@@ -78,9 +78,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings):
         else:
             position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

-        return super(TFRobertaEmbeddings, self)._embedding(
-            [input_ids, position_ids, token_type_ids, inputs_embeds], training=training
-        )
+        return super()._embedding([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)


 class TFRobertaMainLayer(TFBertMainLayer):

@@ -89,7 +87,7 @@ class TFRobertaMainLayer(TFBertMainLayer):
     """

     def __init__(self, config, **kwargs):
-        super(TFRobertaMainLayer, self).__init__(config, **kwargs)
+        super().__init__(config, **kwargs)
         self.embeddings = TFRobertaEmbeddings(config, name="embeddings")

     def get_input_embeddings(self):

@@ -234,7 +232,7 @@ class TFRobertaModel(TFRobertaPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFRobertaModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.roberta = TFRobertaMainLayer(config, name="roberta")

     def call(self, inputs, **kwargs):

@@ -246,7 +244,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
     """Roberta Head for masked language modeling."""

     def __init__(self, config, input_embeddings, **kwargs):
-        super(TFRobertaLMHead, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = config.vocab_size
         self.dense = tf.keras.layers.Dense(
             config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"

@@ -260,7 +258,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
     def build(self, input_shape):
         self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
-        super(TFRobertaLMHead, self).build(input_shape)
+        super().build(input_shape)

     def call(self, features):
         x = self.dense(features)

@@ -305,7 +303,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFRobertaForMaskedLM, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.roberta = TFRobertaMainLayer(config, name="roberta")
         self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings, name="lm_head")

@@ -328,7 +326,7 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer):
     """Head for sentence-level classification tasks."""

     def __init__(self, config, **kwargs):
-        super(TFRobertaClassificationHead, self).__init__(config, **kwargs)
+        super().__init__(config, **kwargs)
         self.dense = tf.keras.layers.Dense(
             config.hidden_size,
             kernel_initializer=get_initializer(config.initializer_range),

@@ -383,7 +381,7 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFRobertaForSequenceClassification, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
         self.roberta = TFRobertaMainLayer(config, name="roberta")

@@ -433,7 +431,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFRobertaForTokenClassification, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
         self.roberta = TFRobertaMainLayer(config, name="roberta")
src/transformers/modeling_tf_t5.py  (view file @ dc17f2a1)

@@ -50,13 +50,13 @@ class TFT5LayerNorm(tf.keras.layers.Layer):
         """ Construct a layernorm module in the T5 style
             No bias and no substraction of mean.
         """
-        super(TFT5LayerNorm, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.variance_epsilon = epsilon

     def build(self, input_shape):
         """Build shared word embedding layer """
         self.weight = self.add_weight("weight", shape=(input_shape[-1],), initializer="ones")
-        super(TFT5LayerNorm, self).build(input_shape)
+        super().build(input_shape)

     def call(self, x):
         variance = tf.math.reduce_mean(tf.math.square(x), axis=-1, keepdims=True)

@@ -66,7 +66,7 @@ class TFT5LayerNorm(tf.keras.layers.Layer):
 class TFT5DenseReluDense(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFT5DenseReluDense, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.wi = tf.keras.layers.Dense(config.d_ff, use_bias=False, name="wi")
         self.wo = tf.keras.layers.Dense(config.d_model, use_bias=False, name="wo")
         self.dropout = tf.keras.layers.Dropout(config.dropout_rate)

@@ -82,7 +82,7 @@ class TFT5DenseReluDense(tf.keras.layers.Layer):
 class TFT5LayerFF(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFT5LayerFF, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.DenseReluDense = TFT5DenseReluDense(config, name="DenseReluDense")
         self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm")
         self.dropout = tf.keras.layers.Dropout(config.dropout_rate)

@@ -98,7 +98,7 @@ class TFT5Attention(tf.keras.layers.Layer):
     NEW_ID = itertools.count()

     def __init__(self, config, has_relative_attention_bias=False, **kwargs):
-        super(TFT5Attention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.layer_id = next(TFT5Attention.NEW_ID)
         self.is_decoder = config.is_decoder
         self.has_relative_attention_bias = has_relative_attention_bias

@@ -259,7 +259,7 @@ class TFT5Attention(tf.keras.layers.Layer):
 class TFT5LayerSelfAttention(tf.keras.layers.Layer):
     def __init__(self, config, has_relative_attention_bias=False, **kwargs):
-        super(TFT5LayerSelfAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.SelfAttention = TFT5Attention(
             config, has_relative_attention_bias=has_relative_attention_bias, name="SelfAttention"
         )

@@ -279,7 +279,7 @@ class TFT5LayerCrossAttention(tf.keras.layers.Layer):
 class TFT5LayerCrossAttention(tf.keras.layers.Layer):
     def __init__(self, config, has_relative_attention_bias=False, **kwargs):
-        super(TFT5LayerCrossAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.EncDecAttention = TFT5Attention(
             config, has_relative_attention_bias=has_relative_attention_bias, name="EncDecAttention"
         )

@@ -299,7 +299,7 @@ class TFT5Block(tf.keras.layers.Layer):
 class TFT5Block(tf.keras.layers.Layer):
     def __init__(self, config, has_relative_attention_bias=False, **kwargs):
-        super(TFT5Block, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.is_decoder = config.is_decoder
         self.layer = []
         self.layer.append(

@@ -361,7 +361,7 @@ class TFT5Block(tf.keras.layers.Layer):
 ####################################################
 class TFT5MainLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFT5MainLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
         self.is_decoder = config.is_decoder

@@ -633,7 +633,7 @@ class TFT5Model(TFT5PreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFT5Model, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared")

         encoder_config = copy.deepcopy(config)

@@ -724,7 +724,7 @@ class TFT5WithLMHeadModel(TFT5PreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFT5WithLMHeadModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.model_dim = config.d_model
         self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared")
src/transformers/modeling_tf_transfo_xl.py
View file @
dc17f2a1
...
...
@@ -36,7 +36,7 @@ TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = {
class
TFPositionalEmbedding
(
tf
.
keras
.
layers
.
Layer
):
def
__init__
(
self
,
demb
,
**
kwargs
):
super
(
TFPositionalEmbedding
,
self
).
__init__
(
**
kwargs
)
super
().
__init__
(
**
kwargs
)
self
.
inv_freq
=
1
/
(
10000
**
(
tf
.
range
(
0
,
demb
,
2.0
)
/
demb
))
...
...
@@ -52,7 +52,7 @@ class TFPositionalEmbedding(tf.keras.layers.Layer):
class
TFPositionwiseFF
(
tf
.
keras
.
layers
.
Layer
):
def
__init__
(
self
,
d_model
,
d_inner
,
dropout
,
pre_lnorm
=
False
,
layer_norm_epsilon
=
1e-5
,
init_std
=
0.02
,
**
kwargs
):
super
(
TFPositionwiseFF
,
self
).
__init__
(
**
kwargs
)
super
().
__init__
(
**
kwargs
)
self
.
d_model
=
d_model
self
.
d_inner
=
d_inner
...
...
@@ -112,7 +112,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
init_std
=
0.02
,
**
kwargs
):
super
(
TFRelPartialLearnableMultiHeadAttn
,
self
).
__init__
(
**
kwargs
)
super
().
__init__
(
**
kwargs
)
self
.
output_attentions
=
output_attentions
self
.
n_head
=
n_head
...
...
@@ -155,7 +155,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
self
.
r_w_bias
=
self
.
add_weight
(
shape
=
(
self
.
n_head
,
self
.
d_head
),
initializer
=
"zeros"
,
trainable
=
True
,
name
=
"r_w_bias"
)
super
(
TFRelPartialLearnableMultiHeadAttn
,
self
).
build
(
input_shape
)
super
().
build
(
input_shape
)
def
_rel_shift
(
self
,
x
):
x_size
=
shape_list
(
x
)
...
...
@@ -267,7 +267,7 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer):
init_std
=
0.02
,
**
kwargs
):
super
(
TFRelPartialLearnableDecoderLayer
,
self
).
__init__
(
**
kwargs
)
super
().
__init__
(
**
kwargs
)
self
.
dec_attn
=
TFRelPartialLearnableMultiHeadAttn
(
n_head
,
...
...
@@ -308,7 +308,7 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer):
class
TFAdaptiveEmbedding
(
tf
.
keras
.
layers
.
Layer
):
def
__init__
(
self
,
n_token
,
d_embed
,
d_proj
,
cutoffs
,
div_val
=
1
,
init_std
=
0.02
,
sample_softmax
=
False
,
**
kwargs
):
super
(
TFAdaptiveEmbedding
,
self
).
__init__
(
**
kwargs
)
super
().
__init__
(
**
kwargs
)
self
.
n_token
=
n_token
self
.
d_embed
=
d_embed
...
...
@@ -350,7 +350,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer):
name
=
"emb_projs_._{}"
.
format
(
i
),
)
)
super
(
TFAdaptiveEmbedding
,
self
).
build
(
input_shape
)
super
().
build
(
input_shape
)
def
call
(
self
,
inp
):
if
self
.
div_val
==
1
:
...
...
@@ -380,7 +380,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer):
class
TFTransfoXLMainLayer
(
tf
.
keras
.
layers
.
Layer
):
def
__init__
(
self
,
config
,
**
kwargs
):
super
(
TFTransfoXLMainLayer
,
self
).
__init__
(
**
kwargs
)
super
().
__init__
(
**
kwargs
)
self
.
output_attentions
=
config
.
output_attentions
self
.
output_hidden_states
=
config
.
output_hidden_states
...
...
@@ -455,7 +455,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
         self.r_r_bias = self.add_weight(
             shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias"
         )
-        super(TFTransfoXLMainLayer, self).build(input_shape)
+        super().build(input_shape)

     def get_input_embeddings(self):
         return self.word_emb
@@ -728,7 +728,7 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFTransfoXLModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.transformer = TFTransfoXLMainLayer(config, name="transformer")

     def call(self, inputs, **kwargs):
@@ -774,7 +774,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
     """

     def __init__(self, config):
-        super(TFTransfoXLLMHeadModel, self).__init__(config)
+        super().__init__(config)
         self.transformer = TFTransfoXLMainLayer(config, name="transformer")
         self.sample_softmax = config.sample_softmax  # use sampled softmax
src/transformers/modeling_tf_transfo_xl_utilities.py
@@ -24,7 +24,7 @@ from .modeling_tf_utils import shape_list
 class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
     def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1, keep_order=False, **kwargs):
-        super(TFAdaptiveSoftmaxMask, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = vocab_size
         self.d_embed = d_embed
@@ -98,7 +98,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
                     name="out_layers_._{}_._bias".format(i),
                 )
                 self.out_layers.append((weight, bias))
-        super(TFAdaptiveSoftmaxMask, self).build(input_shape)
+        super().build(input_shape)

     @staticmethod
     def _logit(x, W, b, proj=None):
src/transformers/modeling_tf_utils.py
@@ -78,7 +78,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin):
         return {"input_ids": tf.constant(DUMMY_INPUTS)}

     def __init__(self, config, *inputs, **kwargs):
-        super(TFPreTrainedModel, self).__init__(*inputs, **kwargs)
+        super().__init__(*inputs, **kwargs)
         if not isinstance(config, PretrainedConfig):
             raise ValueError(
                 "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. "
@@ -385,7 +385,7 @@ class TFConv1D(tf.keras.layers.Layer):
         """ TFConv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2)
            Basically works like a Linear layer but the weights are transposed
        """
-        super(TFConv1D, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.nf = nf
         self.nx = nx
         self.initializer_range = initializer_range
@@ -412,7 +412,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
     """

     def __init__(self, vocab_size, hidden_size, initializer_range=None, **kwargs):
-        super(TFSharedEmbeddings, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
         self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range
@@ -425,7 +425,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
         self.weight = self.add_weight(
             "weight", shape=[self.vocab_size, self.hidden_size], initializer=get_initializer(self.initializer_range)
         )
-        super(TFSharedEmbeddings, self).build(input_shape)
+        super().build(input_shape)

     def call(self, inputs, mode="embedding"):
         """Get token embeddings of inputs.
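The same zero-argument form is applied inside build() methods, where Keras layers create their weights lazily the first time they see an input shape. A small sketch of that pattern, assuming a hypothetical ScaledEmbedding layer rather than any class in this diff:

import tensorflow as tf

class ScaledEmbedding(tf.keras.layers.Layer):
    def __init__(self, vocab_size, hidden_size, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size

    def build(self, input_shape):
        # Weights are created lazily here; the zero-argument call replaces
        # super(ScaledEmbedding, self).build(input_shape) with identical behavior.
        self.weight = self.add_weight("weight", shape=[self.vocab_size, self.hidden_size], initializer="zeros")
        super().build(input_shape)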
@@ -485,7 +485,7 @@ class TFSequenceSummary(tf.keras.layers.Layer):
     """

     def __init__(self, config, initializer_range=0.02, **kwargs):
-        super(TFSequenceSummary, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.summary_type = config.summary_type if hasattr(config, "summary_use_proj") else "last"
         if self.summary_type == "attn":
src/transformers/modeling_tf_xlm.py
@@ -97,7 +97,7 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
     NEW_ID = itertools.count()

     def __init__(self, n_heads, dim, config, **kwargs):
-        super(TFMultiHeadAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.layer_id = next(TFMultiHeadAttention.NEW_ID)
         self.output_attentions = config.output_attentions
         self.dim = dim
@@ -182,7 +182,7 @@ class TFMultiHeadAttention(tf.keras.layers.Layer):
 class TFTransformerFFN(tf.keras.layers.Layer):
     def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs):
-        super(TFTransformerFFN, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.lin1 = tf.keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name="lin1")
         self.lin2 = tf.keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name="lin2")
         self.act = tf.keras.layers.Activation(gelu) if config.gelu_activation else tf.keras.activations.relu
class
TFXLMMainLayer
(
tf
.
keras
.
layers
.
Layer
):
def
__init__
(
self
,
config
,
**
kwargs
):
super
(
TFXLMMainLayer
,
self
).
__init__
(
**
kwargs
)
super
().
__init__
(
**
kwargs
)
self
.
output_attentions
=
config
.
output_attentions
self
.
output_hidden_states
=
config
.
output_hidden_states
...
...
@@ -608,7 +608,7 @@ class TFXLMModel(TFXLMPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFXLMModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.transformer = TFXLMMainLayer(config, name="transformer")

     def call(self, inputs, **kwargs):
@@ -622,7 +622,7 @@ class TFXLMPredLayer(tf.keras.layers.Layer):
     """

     def __init__(self, config, input_embeddings, **kwargs):
-        super(TFXLMPredLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.asm = config.asm
         self.n_words = config.n_words
         self.pad_index = config.pad_index
def
build
(
self
,
input_shape
):
# The output weights are the same as the input embeddings, but there is an output-only bias for each token.
self
.
bias
=
self
.
add_weight
(
shape
=
(
self
.
n_words
,),
initializer
=
"zeros"
,
trainable
=
True
,
name
=
"bias"
)
super
(
TFXLMPredLayer
,
self
).
build
(
input_shape
)
super
().
build
(
input_shape
)
def
call
(
self
,
hidden_states
):
hidden_states
=
self
.
input_embeddings
(
hidden_states
,
mode
=
"linear"
)
...
...
@@ -682,7 +682,7 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
"""
def
__init__
(
self
,
config
,
*
inputs
,
**
kwargs
):
super
(
TFXLMWithLMHeadModel
,
self
).
__init__
(
config
,
*
inputs
,
**
kwargs
)
super
().
__init__
(
config
,
*
inputs
,
**
kwargs
)
self
.
transformer
=
TFXLMMainLayer
(
config
,
name
=
"transformer"
)
self
.
pred_layer
=
TFXLMPredLayer
(
config
,
self
.
transformer
.
embeddings
,
name
=
"pred_layer_._proj"
)
...
...
@@ -733,7 +733,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFXLMForSequenceClassification, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
         self.transformer = TFXLMMainLayer(config, name="transformer")
@@ -784,7 +784,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFXLMForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.transformer = TFXLMMainLayer(config, name="transformer")
         self.qa_outputs = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs"
src/transformers/modeling_tf_xlnet.py
@@ -57,7 +57,7 @@ ACT2FN = {
 class TFXLNetRelativeAttention(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFXLNetRelativeAttention, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.output_attentions = config.output_attentions
         if config.d_model % config.n_head != 0:
@@ -104,7 +104,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
         self.seg_embed = self.add_weight(
             shape=(2, self.n_head, self.d_head), initializer=initializer, trainable=True, name="seg_embed"
         )
-        super(TFXLNetRelativeAttention, self).build(input_shape)
+        super().build(input_shape)

     def prune_heads(self, heads):
         raise NotImplementedError
@@ -280,7 +280,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
 class TFXLNetFeedForward(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFXLNetFeedForward, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
         self.layer_1 = tf.keras.layers.Dense(
             config.d_inner, kernel_initializer=get_initializer(config.initializer_range), name="layer_1"
@@ -307,7 +307,7 @@ class TFXLNetFeedForward(tf.keras.layers.Layer):
 class TFXLNetLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFXLNetLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.rel_attn = TFXLNetRelativeAttention(config, name="rel_attn")
         self.ff = TFXLNetFeedForward(config, name="ff")
         self.dropout = tf.keras.layers.Dropout(config.dropout)
@@ -326,7 +326,7 @@ class TFXLNetLayer(tf.keras.layers.Layer):
 class TFXLNetLMHead(tf.keras.layers.Layer):
     def __init__(self, config, input_embeddings, **kwargs):
-        super(TFXLNetLMHead, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = config.vocab_size
         # The output weights are the same as the input embeddings, but there is
         # an output-only bias for each token.
@@ -334,7 +334,7 @@ class TFXLNetLMHead(tf.keras.layers.Layer):
     def build(self, input_shape):
         self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
-        super(TFXLNetLMHead, self).build(input_shape)
+        super().build(input_shape)

     def call(self, hidden_states):
         hidden_states = self.input_embeddings(hidden_states, mode="linear")
@@ -344,7 +344,7 @@ class TFXLNetLMHead(tf.keras.layers.Layer):
 class TFXLNetMainLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
-        super(TFXLNetMainLayer, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
         self.output_past = config.output_past
@@ -832,7 +832,7 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFXLNetModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.transformer = TFXLNetMainLayer(config, name="transformer")

     def call(self, inputs, **kwargs):
@@ -885,7 +885,7 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFXLNetLMHeadModel, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.transformer = TFXLNetMainLayer(config, name="transformer")
         self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name="lm_loss")
@@ -940,7 +940,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFXLNetForSequenceClassification, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
         self.transformer = TFXLNetMainLayer(config, name="transformer")
@@ -1001,7 +1001,7 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFXLNetForTokenClassification, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels
         self.transformer = TFXLNetMainLayer(config, name="transformer")
@@ -1058,7 +1058,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
     """

     def __init__(self, config, *inputs, **kwargs):
-        super(TFXLNetForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs)
+        super().__init__(config, *inputs, **kwargs)
         self.transformer = TFXLNetMainLayer(config, name="transformer")
         self.qa_outputs = tf.keras.layers.Dense(
             config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
@@ -1127,7 +1127,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
 #     """
 #     def __init__(self, config, *inputs, **kwargs):
-#         super(TFXLNetForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
+#         super().__init__(config, *inputs, **kwargs)
 #         self.start_n_top = config.start_n_top
 #         self.end_n_top = config.end_n_top
src/transformers/modeling_transfo_xl.py
@@ -165,7 +165,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path):
 class PositionalEmbedding(nn.Module):
     def __init__(self, demb):
-        super(PositionalEmbedding, self).__init__()
+        super().__init__()
         self.demb = demb
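On the PyTorch side the change is identical: nn.Module subclasses call the zero-argument super().__init__() before registering any parameters or buffers. A short sketch with a hypothetical SinusoidalEmbedding module, not taken from this diff:

import torch
import torch.nn as nn

class SinusoidalEmbedding(nn.Module):
    def __init__(self, demb):
        # Previously written as super(SinusoidalEmbedding, self).__init__()
        super().__init__()
        self.demb = demb
        # The buffer is registered only after the base-class constructor has run.
        self.register_buffer("inv_freq", 1.0 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb)))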
@@ -184,7 +184,7 @@ class PositionalEmbedding(nn.Module):
 class PositionwiseFF(nn.Module):
     def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5):
-        super(PositionwiseFF, self).__init__()
+        super().__init__()
         self.d_model = d_model
         self.d_inner = d_inner
@@ -236,7 +236,7 @@ class RelPartialLearnableMultiHeadAttn(nn.Module):
         output_attentions=False,
         layer_norm_epsilon=1e-5,
     ):
-        super(RelPartialLearnableMultiHeadAttn, self).__init__()
+        super().__init__()
         self.output_attentions = output_attentions
         self.n_head = n_head
@@ -368,7 +368,7 @@ class RelPartialLearnableMultiHeadAttn(nn.Module):
 class RelPartialLearnableDecoderLayer(nn.Module):
     def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon=1e-5, **kwargs):
-        super(RelPartialLearnableDecoderLayer, self).__init__()
+        super().__init__()
         self.dec_attn = RelPartialLearnableMultiHeadAttn(
             n_head, d_model, d_head, dropout, layer_norm_epsilon=layer_norm_epsilon, **kwargs
class
AdaptiveEmbedding
(
nn
.
Module
):
def
__init__
(
self
,
n_token
,
d_embed
,
d_proj
,
cutoffs
,
div_val
=
1
,
sample_softmax
=
False
):
super
(
AdaptiveEmbedding
,
self
).
__init__
()
super
().
__init__
()
self
.
n_token
=
n_token
self
.
d_embed
=
d_embed
...
...
@@ -587,7 +587,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
     """

     def __init__(self, config):
-        super(TransfoXLModel, self).__init__(config)
+        super().__init__(config)
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
@@ -845,7 +845,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
     """

     def __init__(self, config):
-        super(TransfoXLLMHeadModel, self).__init__(config)
+        super().__init__(config)
         self.transformer = TransfoXLModel(config)
         self.sample_softmax = config.sample_softmax  # use sampled softmax
src/transformers/modeling_transfo_xl_utilities.py
@@ -29,7 +29,7 @@ import torch.nn.functional as F
 class ProjectedAdaptiveLogSoftmax(nn.Module):
     def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, keep_order=False):
-        super(ProjectedAdaptiveLogSoftmax, self).__init__()
+        super().__init__()
         self.n_token = n_token
         self.d_embed = d_embed