chenpangpang / transformers / Commits / 34f28b2a

Commit 34f28b2a, authored Sep 08, 2019 by thomwolf

WIP GPT2

parent ad88563b
Changes: 3 changed files, with 292 additions and 76 deletions (+292 −76)

  pytorch_transformers/modeling_tf_bert.py   +17  −17
  pytorch_transformers/modeling_tf_gpt2.py   +272 −56
  pytorch_transformers/modeling_tf_utils.py  +3   −3
pytorch_transformers/modeling_tf_bert.py  (view file @ 34f28b2a)
@@ -684,13 +684,13 @@ class TFBertModel(TFBertPreTrainedModel):

         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
         model = TFBertModel.from_pretrained('bert-base-uncased')
-        input_ids = tf.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
         outputs = model(input_ids)
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

     """
-    def __init__(self, config):
-        super(TFBertModel, self).__init__(config)
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFBertModel, self).__init__(config, *inputs, **kwargs)
         self.bert = TFBertMainLayer(config, name='bert')

     @tf.function
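A note on the docstring fix above: `tf.tensor` does not exist in TensorFlow, so `tf.constant` is the right constructor. The example still chains `.unsqueeze(0)`, though, which is a PyTorch method that TF tensors do not have; a TF-idiomatic version of the snippet (a sketch, not part of this commit) would be:

    import tensorflow as tf

    # Wrapping the encoded ids in an outer list adds the batch dimension directly,
    # replacing PyTorch's .unsqueeze(0); tf.expand_dims(ids, 0) works as well.
    input_ids = tf.constant([tokenizer.encode("Hello, my dog is cute")])  # Batch size 1
    outputs = model(input_ids)
    last_hidden_states = outputs[0]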
@@ -739,8 +739,8 @@ class TFBertForPreTraining(TFBertPreTrainedModel):
         prediction_scores, seq_relationship_scores = outputs[:2]

     """
-    def __init__(self, config):
-        super(TFBertForPreTraining, self).__init__(config)
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFBertForPreTraining, self).__init__(config, *inputs, **kwargs)
         self.bert = TFBertMainLayer(config, name='bert')
         self.cls_nsp = TFBertNSPHead(config, name='cls_nsp')
@@ -790,8 +790,8 @@ class TFBertForMaskedLM(TFBertPreTrainedModel):
         loss, prediction_scores = outputs[:2]

     """
-    def __init__(self, config):
-        super(TFBertForMaskedLM, self).__init__(config)
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFBertForMaskedLM, self).__init__(config, *inputs, **kwargs)
         self.bert = TFBertMainLayer(config, name='bert')
@@ -839,8 +839,8 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel):
         seq_relationship_scores = outputs[0]

     """
-    def __init__(self, config):
-        super(TFBertForNextSentencePrediction, self).__init__(config)
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFBertForNextSentencePrediction, self).__init__(config, *inputs, **kwargs)
         self.bert = TFBertMainLayer(config, name='bert')
         self.cls_nsp = TFBertNSPHead(config, name='cls_nsp')
@@ -891,8 +891,8 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
         loss, logits = outputs[:2]

     """
-    def __init__(self, config):
-        super(TFBertForSequenceClassification, self).__init__(config)
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFBertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels

         self.bert = TFBertMainLayer(config, name='bert')
@@ -984,8 +984,8 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
         loss, classification_scores = outputs[:2]

     """
-    def __init__(self, config):
-        super(TFBertForMultipleChoice, self).__init__(config)
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFBertForMultipleChoice, self).__init__(config, *inputs, **kwargs)
         self.bert = TFBertMainLayer(config, name='bert')
         self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
@@ -1066,8 +1066,8 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
         loss, scores = outputs[:2]

     """
-    def __init__(self, config):
-        super(TFBertForTokenClassification, self).__init__(config)
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFBertForTokenClassification, self).__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels

         self.bert = TFBertMainLayer(config, name='bert')
@@ -1128,8 +1128,8 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel):
         loss, start_scores, end_scores = outputs[:2]

     """
-    def __init__(self, config):
-        super(TFBertForQuestionAnswering, self).__init__(config)
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
         self.num_labels = config.num_labels

         self.bert = TFBertMainLayer(config, name='bert')
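All eight class changes above follow one pattern: each TF model now accepts `*inputs, **kwargs` and forwards them through `TFPreTrainedModel` to `tf.keras.Model` (see modeling_tf_utils.py below), so Keras-level construction options such as `name=` can reach the base class. A minimal sketch of the pattern, with hypothetical class names:

    import tensorflow as tf

    class TFPreTrainedToy(tf.keras.Model):
        def __init__(self, config, *inputs, **kwargs):
            # Forward anything Keras understands (e.g. name=) to tf.keras.Model.
            super(TFPreTrainedToy, self).__init__(*inputs, **kwargs)
            self.config = config

    class TFToyModel(TFPreTrainedToy):
        def __init__(self, config, *inputs, **kwargs):
            super(TFToyModel, self).__init__(config, *inputs, **kwargs)

    model = TFToyModel(config={}, name="toy")  # name= now reaches Keras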
pytorch_transformers/modeling_tf_gpt2.py  (view file @ 34f28b2a)
@@ -128,8 +128,8 @@ class TFAttention(tf.keras.layers.Layer):
         self.split_size = n_state
         self.scale = scale

-        self.c_attn = TFConv1D(n_state * 3, nx)
-        self.c_proj = TFConv1D(n_state, nx)
+        self.c_attn = TFConv1D(n_state * 3, nx, name='c_attn')
+        self.c_proj = TFConv1D(n_state, nx, name='c_proj')
         self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop)
         self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop)
         self.pruned_heads = set()
@@ -139,7 +139,7 @@ class TFAttention(tf.keras.layers.Layer):

     @staticmethod
     @tf.function
-    def attention_mask(nd, ns, *, dtype):
+    def attention_mask(nd, ns, dtype):
         """1's in the lower triangle, counting from the lower right corner.
         Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
         """
@@ -164,7 +164,8 @@ class TFAttention(tf.keras.layers.Layer):
             w = w * b - 1e4 * (1 - b)

         w = tf.nn.softmax(w)
-        w = self.attn_dropout(w, training=training)
+        if training:
+            w = self.attn_dropout(w)

         # Mask heads if we want to
         if head_mask is not None:
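Two ways of gating dropout at inference appear in this commit: passing `training=training` to the Keras layer (the old line) and wrapping the call in a Python `if training:` (the new one). Both are meant to disable dropout outside training; a minimal sketch of the difference, with a toy layer:

    import tensorflow as tf

    drop = tf.keras.layers.Dropout(0.5)
    x = tf.ones([2, 4])

    # Style 1: let Keras resolve the flag per call.
    y = drop(x, training=True)    # dropout applied
    y = drop(x, training=False)   # identity

    # Style 2 (this commit): gate with a Python conditional, resolved at
    # tf.function trace time. Note a bare drop(x) leaves training=None and
    # falls back to the Keras learning-phase default, so the explicit flag
    # is the more robust of the two forms.
    training = True
    y = drop(x) if training else x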
@@ -204,54 +205,238 @@ class TFAttention(tf.keras.layers.Layer):
             value = tf.concat([past_value, value], axis=-2)
         present = tf.stack([key, value], axis=1)

-        attn_outputs = self._attn(query, key, value, head_mask)
+        attn_outputs = self._attn(query, key, value, head_mask, training=training)
         a = attn_outputs[0]

         a = self.merge_heads(a)
         a = self.c_proj(a)
-        a = self.resid_dropout(a, training=training)
+        if training:
+            a = self.resid_dropout(a)

         outputs = [a, present] + attn_outputs[1:]
         return outputs  # a, present, (attentions)

-class MLP(nn.Module):
-    def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
-        super(MLP, self).__init__()
+class TFMLP(nn.Module):
+    def __init__(self, n_state, config, **kwargs):
+        super(TFMLP, self).__init__(**kwargs)
         nx = config.n_embd
-        self.c_fc = TFConv1D(n_state, nx)
-        self.c_proj = TFConv1D(nx, n_state)
+        self.c_fc = TFConv1D(n_state, nx, name='c_fc')
+        self.c_proj = TFConv1D(nx, n_state, name='c_proj')
         self.act = gelu
-        self.dropout = nn.Dropout(config.resid_pdrop)
+        self.dropout = tf.keras.layers.Dropout(config.resid_pdrop)

-    def forward(self, x):
+    @tf.function
+    def call(self, x, training=False):
         h = self.act(self.c_fc(x))
         h2 = self.c_proj(h)
-        return self.dropout(h2)
+        if training:
+            h2 = self.dropout(h2)
+        return h2

 class TFBlock(tf.keras.layers.Layer):
     def __init__(self, n_ctx, config, scale=False, **kwargs):
         super(TFBlock, self).__init__(**kwargs)
         nx = config.n_embd
-        self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
-        self.attn = Attention(nx, n_ctx, config, scale)
-        self.ln_2 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
-        self.mlp = MLP(4 * nx, config)
+        self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_1')
+        self.attn = TFAttention(nx, n_ctx, config, scale, name='attn')
+        self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_2')
+        self.mlp = TFMLP(4 * nx, config, name='mlp')

-    def forward(self, x, layer_past=None, head_mask=None):
-        output_attn = self.attn(self.ln_1(x), layer_past=layer_past, head_mask=head_mask)
+    @tf.function
+    def call(self, x, layer_past=None, head_mask=None, training=False):
+        output_attn = self.attn(self.ln_1(x), layer_past=layer_past, head_mask=head_mask, training=training)
         a = output_attn[0]  # output_attn: a, present, (attentions)

         x = x + a
-        m = self.mlp(self.ln_2(x))
+        m = self.mlp(self.ln_2(x), training=training)
         x = x + m

         outputs = [x] + output_attn[1:]
         return outputs  # x, present, (attentions)
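Structurally the ported block keeps GPT-2's pre-norm residual layout: each sub-layer reads a LayerNorm-ed copy of the stream and writes its output back additively. A shape-level sketch with stand-in sub-layers (not the commit's classes):

    import tensorflow as tf

    ln_1 = tf.keras.layers.LayerNormalization(epsilon=1e-5)
    ln_2 = tf.keras.layers.LayerNormalization(epsilon=1e-5)
    attn = tf.keras.layers.Dense(8)  # stand-in for TFAttention
    mlp = tf.keras.layers.Dense(8)   # stand-in for TFMLP

    x = tf.random.normal([2, 5, 8])
    x = x + attn(ln_1(x))  # residual around normalized attention
    x = x + mlp(ln_2(x))   # residual around normalized MLP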

+class TFGPT2Embeddings(tf.keras.layers.Layer):
+    """Construct the embeddings from word, position and token_type embeddings.
+    """
+    def __init__(self, config, **kwargs):
+        super(TFGPT2Embeddings, self).__init__(**kwargs)
+        self.vocab_size = config.vocab_size
+        self.hidden_size = config.hidden_size
+
+    def build(self, input_shape):
+        """Build shared word embedding layer
+        Shared weights logic adapted from
+            https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
+        """
+        self.weight = self.add_weight(
+            "weight",
+            shape=[self.vocab_size, self.n_embed],
+            initializer=tf.random_normal_initializer(mean=0., stddev=self.n_embed**-0.5))
+        super(TFBertEmbeddings, self).build(input_shape)
+
+    @tf.function
+    def call(self, inputs, mode="embedding", training=False):
+        """Get token embeddings of inputs.
+        Args:
+            inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids)
+            mode: string, a valid value is one of "embedding" and "linear".
+        Returns:
+            outputs: (1) If mode == "embedding", output embedding tensor, float32 with
+                shape [batch_size, length, embedding_size]; (2) mode == "linear", output
+                linear tensor, float32 with shape [batch_size, length, vocab_size].
+        Raises:
+            ValueError: if mode is not valid.
+
+        Shared weights logic adapted from
+            https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
+        """
+        if mode == "embedding":
+            return self._embedding(inputs, training=training)
+        elif mode == "linear":
+            return self._linear(inputs)
+        else:
+            raise ValueError("mode {} is not valid.".format(mode))
+
+    def _embedding(self, input_ids):
+        """Applies embedding based on inputs tensor."""
+        return tf.gather(self.weight, input_ids)
+
+    def _linear(self, inputs):
+        """Computes logits by running inputs through a linear layer.
+        Args:
+            inputs: A float32 tensor with shape [batch_size, length, hidden_size]
+        Returns:
+            float32 tensor with shape [batch_size, length, vocab_size].
+        """
+        batch_size = tf.shape(inputs)[0]
+        length = tf.shape(inputs)[1]
+
+        x = tf.reshape(inputs, [-1, self.n_embed])
+        logits = tf.matmul(x, self.weight, transpose_b=True)
+
+        return tf.reshape(logits, [batch_size, length, self.vocab_size])
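The new embeddings layer implements input/output weight tying: one [vocab_size, n_embd] matrix both embeds tokens ("embedding" mode, a gather) and produces logits ("linear" mode, a matmul against the transposed matrix). A runnable toy version of the two modes:

    import tensorflow as tf

    vocab_size, n_embd = 10, 4
    weight = tf.random.normal([vocab_size, n_embd], stddev=n_embd ** -0.5)

    input_ids = tf.constant([[1, 2, 3]])
    inputs_embeds = tf.gather(weight, input_ids)             # "embedding" mode: [1, 3, 4]

    hidden = tf.random.normal([1, 3, n_embd])
    flat = tf.reshape(hidden, [-1, n_embd])
    logits = tf.reshape(tf.matmul(flat, weight, transpose_b=True),
                        [1, 3, vocab_size])                  # "linear" mode: [1, 3, 10]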

+class TFGPT2MainLayer(tf.keras.layers.Layer):
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFGPT2MainLayer, self).__init__(config, *inputs, **kwargs)
+        self.output_hidden_states = config.output_hidden_states
+        self.output_attentions = config.output_attentions
+        self.vocab_size = config.vocab_size
+        self.n_embd = config.n_embd
+
+        self.wte = TFGPT2Embeddings(config, name='wte')
+        self.wpe = tf.keras.layers.Embedding(config.n_positions, config.n_embd, name='wpe')
+        self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
+        self.h = [TFBlock(config.n_ctx, config, scale=True, name='h_{}'.format(i)) for i in range(config.n_layer)]
+        self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_f')
+
+    def _resize_token_embeddings(self, new_num_tokens):
+        raise NotImplementedError
+
+    def _prune_heads(self, heads_to_prune):
+        """ Prunes heads of the model.
+            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        """
+        raise NotImplementedError
+
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, head_mask=None):
+    @tf.function
+    def call(self, inputs, training=False):
+        if not isinstance(inputs, (dict, tuple, list)):
+            input_ids = inputs
+            attention_mask, head_mask, position_ids, token_type_ids = None, None, None, None
+        elif isinstance(inputs, (tuple, list)):
+            input_ids = inputs[0]
+            attention_mask = inputs[1] if len(inputs) > 1 else None
+            token_type_ids = inputs[2] if len(inputs) > 2 else None
+            position_ids = inputs[3] if len(inputs) > 3 else None
+            head_mask = inputs[4] if len(inputs) > 4 else None
+            assert len(inputs) <= 5, "Too many inputs."
+        else:
+            input_ids = inputs.get('input_ids')
+            attention_mask = inputs.get('attention_mask', None)
+            token_type_ids = inputs.get('token_type_ids', None)
+            position_ids = inputs.get('position_ids', None)
+            head_mask = inputs.get('head_mask', None)
+            assert len(inputs) <= 5, "Too many inputs."
+
+        if past is None:
+            past_length = 0
+            past = [None] * len(self.h)
+        else:
+            past_length = past[0][0].size(-2)
+        if position_ids is None:
+            position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device)
+            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # head_mask has shape n_layer x batch x n_heads x N x N
+        if head_mask is not None:
+            if head_mask.dim() == 1:
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+                head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
+            elif head_mask.dim() == 2:
+                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
+            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)  # switch to fload if need + fp16 compatibility
+        else:
+            head_mask = [None] * self.config.n_layer
+
+        input_shape = input_ids.size()
+        input_ids = input_ids.view(-1, input_ids.size(-1))
+        position_ids = position_ids.view(-1, position_ids.size(-1))
+
+        inputs_embeds = self.wte(input_ids)
+        position_embeds = self.wpe(position_ids)
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
+            token_type_embeds = self.wte(token_type_ids)
+        else:
+            token_type_embeds = 0
+        hidden_states = inputs_embeds + position_embeds + token_type_embeds
+        hidden_states = self.drop(hidden_states)
+
+        output_shape = input_shape + (hidden_states.size(-1),)
+
+        presents = ()
+        all_attentions = []
+        all_hidden_states = ()
+        for i, (block, layer_past) in enumerate(zip(self.h, past)):
+            if self.output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
+
+            outputs = block(hidden_states, layer_past, head_mask[i])
+            hidden_states, present = outputs[:2]
+            presents = presents + (present,)
+
+            if self.output_attentions:
+                all_attentions.append(outputs[2])
+
+        hidden_states = self.ln_f(hidden_states)
+
+        hidden_states = hidden_states.view(*output_shape)
+        # Add last hidden state
+        if self.output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        outputs = (hidden_states, presents)
+        if self.output_hidden_states:
+            outputs = outputs + (all_hidden_states,)
+        if self.output_attentions:
+            # let the number of heads free (-1) so we can extract attention even after head pruning
+            attention_output_shape = input_shape[:-1] + (-1,) + all_attentions[0].shape[-2:]
+            all_attentions = tuple(t.view(*attention_output_shape) for t in all_attentions)
+            outputs = outputs + (all_attentions,)
+        return outputs  # last hidden state, presents, (all hidden_states), (attentions)
+
+
-class GPT2PreTrainedModel(PreTrainedModel):
+class TFGPT2PreTrainedModel(TFPreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
@@ -260,22 +445,6 @@ class GPT2PreTrainedModel(PreTrainedModel):
     load_tf_weights = load_tf_weights_in_gpt2
     base_model_prefix = "transformer"

-    def __init__(self, *inputs, **kwargs):
-        super(GPT2PreTrainedModel, self).__init__(*inputs, **kwargs)
-
-    def _init_weights(self, module):
-        """ Initialize the weights.
-        """
-        if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
-            # Slightly different from the TF version which uses truncated_normal for initialization
-            # cf https://github.com/pytorch/pytorch/pull/5617
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.LayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)

 GPT2_START_DOCSTRING = r"""    OpenAI GPT-2 model was proposed in
     `Language Models are Unsupervised Multitask Learners`_
@@ -283,14 +452,26 @@ GPT2_START_DOCSTRING = r""" OpenAI GPT-2 model was proposed in
     It's a causal (unidirectional) transformer pre-trained using language modeling on a very large
     corpus of ~40 GB of text data.

-    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
-    refer to the PyTorch documentation for all matter related to general usage and behavior.
+    This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
+    refer to the TF 2.0 documentation for all matter related to general usage and behavior.

     .. _`Language Models are Unsupervised Multitask Learners`:
         https://openai.com/blog/better-language-models/

-    .. _`torch.nn.Module`:
-        https://pytorch.org/docs/stable/nn.html#module
+    .. _`tf.keras.Model`:
+        https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
+
+    Important note on the model inputs:
+        The inputs of the TF 2.0 models are slightly different from the PyTorch ones since
+        TF 2.0 Keras doesn't accept named arguments with defaults values for input Tensor.
+        More precisely, input Tensors are gathered in the first arguments of the model call function: `model(inputs)`.
+        There are three possibilities to gather and feed the inputs to the model:
+        - a single Tensor with input_ids only and nothing else: `model(inputs_ids)`
+        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+            `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+            `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`

     Parameters:
         config (:class:`~pytorch_transformers.GPT2Config`): Model configuration class with all the parameters of the model.
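To make the note concrete, a short sketch of the three calling conventions (token ids arbitrary; `TFGPT2Model.from_pretrained` as in the Examples docstring below):

    import tensorflow as tf

    model = TFGPT2Model.from_pretrained('gpt2')
    input_ids = tf.constant([[31, 51, 99]])
    attention_mask = tf.constant([[1, 1, 1]])

    outputs = model(input_ids)                            # a single tensor
    outputs = model([input_ids, attention_mask])          # a list, in docstring order
    outputs = model({'input_ids': input_ids,              # a dict keyed by input name
                     'attention_mask': attention_mask})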
@@ -325,7 +506,7 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs:

 @add_start_docstrings("The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.",
                       GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
-class GPT2Model(GPT2PreTrainedModel):
+class TFGPT2Model(TFGPT2PreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
         **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``

@@ -345,37 +526,72 @@ class GPT2Model(GPT2PreTrainedModel):

     Examples::

         tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        model = GPT2Model.from_pretrained('gpt2')
+        model = TFGPT2Model.from_pretrained('gpt2')
         input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
         outputs = model(input_ids)
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

     """
-    def __init__(self, config):
-        super(GPT2Model, self).__init__(config)
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFGPT2Model, self).__init__(config, *inputs, **kwargs)
         self.output_hidden_states = config.output_hidden_states
         self.output_attentions = config.output_attentions
+        self.vocab_size = config.vocab_size
+        self.n_embd = config.n_embd

-        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
-        self.wpe = nn.Embedding(config.n_positions, config.n_embd)
-        self.drop = nn.Dropout(config.embd_pdrop)
-        self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
-        self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
+        self.wpe = tf.keras.layers.Embedding(config.n_positions, config.n_embd, name='wpe')
+        self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
+        self.h = [TFBlock(config.n_ctx, config, scale=True, name='h_{}'.format(i)) for i in range(config.n_layer)]
+        self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_f')

         self.init_weights()

+    def build(self, input_shape):
+        """Build shared word embedding layer
+        Shared weights logic adapted from
+            https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
+        """
+        with tf.name_scope("wte"):
+            # Create and initialize weights. The random normal initializer was chosen
+            # arbitrarily, and works well.
+            self.wte = self.add_weight(
+                "weight",
+                shape=[self.vocab_size, self.n_embed],
+                initializer=tf.random_normal_initializer(mean=0., stddev=self.n_embed**-0.5))
+        super(TFGPT2Model, self).build(input_shape)

     def _resize_token_embeddings(self, new_num_tokens):
-        self.wte = self._get_resized_embeddings(self.wte, new_num_tokens)
-        return self.wte
+        raise NotImplementedError

     def _prune_heads(self, heads_to_prune):
         """ Prunes heads of the model.
             heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
         """
-        for layer, heads in heads_to_prune.items():
-            self.h[layer].attn.prune_heads(heads)
+        raise NotImplementedError

-    def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, head_mask=None):
+    @tf.function
+    def call(self, inputs, training=False):
+        if not isinstance(inputs, (dict, tuple, list)):
+            input_ids = inputs
+            attention_mask, head_mask, position_ids, token_type_ids = None, None, None, None
+        elif isinstance(inputs, (tuple, list)):
+            input_ids = inputs[0]
+            attention_mask = inputs[1] if len(inputs) > 1 else None
+            token_type_ids = inputs[2] if len(inputs) > 2 else None
+            position_ids = inputs[3] if len(inputs) > 3 else None
+            head_mask = inputs[4] if len(inputs) > 4 else None
+            assert len(inputs) <= 5, "Too many inputs."
+        else:
+            input_ids = inputs.get('input_ids')
+            attention_mask = inputs.get('attention_mask', None)
+            token_type_ids = inputs.get('token_type_ids', None)
+            position_ids = inputs.get('position_ids', None)
+            head_mask = inputs.get('head_mask', None)
+            assert len(inputs) <= 5, "Too many inputs."

         if past is None:
             past_length = 0
             past = [None] * len(self.h)
pytorch_transformers/modeling_tf_utils.py  (view file @ 34f28b2a)
@@ -52,7 +52,7 @@ class TFPreTrainedModel(tf.keras.Model):
     base_model_prefix = ""

     def __init__(self, config, *inputs, **kwargs):
-        super(TFPreTrainedModel, self).__init__()
+        super(TFPreTrainedModel, self).__init__(*inputs, **kwargs)
         if not isinstance(config, PretrainedConfig):
             raise ValueError(
                 "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. "
@@ -257,11 +257,11 @@ class TFPreTrainedModel(tf.keras.Model):
         return model


 class TFConv1D(tf.keras.layers.Layer):
-    def __init__(self, nf, nx):
+    def __init__(self, nf, nx, *inputs, **kwargs):
         """ TFConv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2)
             Basically works like a Linear layer but the weights are transposed
         """
-        super(TFConv1D, self).__init__()
+        super(TFConv1D, self).__init__(*inputs, **kwargs)
         self.nf = nf
         self.nx = nx
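For reference, the docstring describes a linear layer with transposed weights; the build/call bodies are not shown in this hunk, so the following is an assumption-laden sketch of what such a layer computes, not the commit's code:

    import tensorflow as tf

    class ToyConv1D(tf.keras.layers.Layer):
        # Linear layer with transposed weights, in the style of Radford et al.'s GPT code.
        def __init__(self, nf, nx, **kwargs):
            super(ToyConv1D, self).__init__(**kwargs)
            self.nf = nf  # output features
            self.nx = nx  # input features

        def build(self, input_shape):
            self.w = self.add_weight("w", shape=[self.nx, self.nf],
                                     initializer=tf.random_normal_initializer(stddev=0.02))
            self.b = self.add_weight("b", shape=[self.nf],
                                     initializer=tf.zeros_initializer())

        def call(self, x):
            # [..., nx] @ [nx, nf] + [nf] -> [..., nf]
            size_out = tf.concat([tf.shape(x)[:-1], [self.nf]], axis=0)
            x = tf.reshape(x, [-1, self.nx])
            x = tf.matmul(x, self.w) + self.b
            return tf.reshape(x, size_out)

    y = ToyConv1D(nf=8, nx=4)(tf.ones([2, 3, 4]))  # shape [2, 3, 8]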