chenpangpang/transformers, commit 3a527fa8
Authored Sep 18, 2019 by thomwolf. Parent: 556442af

    OpenAI GPT tests ok
Showing 5 changed files with 793 additions and 10 deletions.
    pytorch_transformers/configuration_xlm.py                    +0    -3
    pytorch_transformers/configuration_xlnet.py                  +1    -3
    pytorch_transformers/modeling_tf_gpt2.py                     +3    -4
    pytorch_transformers/modeling_tf_openai.py                   +558  -0
    pytorch_transformers/tests/modeling_tf_openai_gpt_test.py    +231  -0
pytorch_transformers/configuration_xlm.py

@@ -56,8 +56,6 @@ class XLMConfig(PretrainedConfig):
        dropout: The dropout probability for all fully connected
            layers in the embeddings, encoder, and pooler.
        dropatt: The dropout ratio for the attention
            probabilities.
        max_position_embeddings: The maximum sequence length that this model might
            ever be used with. Typically set this to something large just in case
            (e.g., 512 or 1024 or 2048).
@@ -66,7 +64,6 @@ class XLMConfig(PretrainedConfig):
        layer_norm_eps: The epsilon used by LayerNorm.

        dropout: float, dropout rate.
        dropatt: float, dropout rate on attention probabilities.
        init: str, the initialization scheme, either "normal" or "uniform".
        init_range: float, initialize the parameters with a uniform distribution
            in [-init_range, init_range]. Only effective when init="uniform".
pytorch_transformers/configuration_xlnet.py

@@ -49,14 +49,11 @@ class XLNetConfig(PretrainedConfig):
        dropout: The dropout probability for all fully connected
            layers in the embeddings, encoder, and pooler.
        dropatt: The dropout ratio for the attention
            probabilities.
        initializer_range: The stddev of the truncated_normal_initializer for
            initializing all weight matrices.
        layer_norm_eps: The epsilon used by LayerNorm.

        dropout: float, dropout rate.
        dropatt: float, dropout rate on attention probabilities.
        init: str, the initialization scheme, either "normal" or "uniform".
        init_range: float, initialize the parameters with a uniform distribution
            in [-init_range, init_range]. Only effective when init="uniform".
@@ -80,6 +77,7 @@ class XLNetConfig(PretrainedConfig):
                 n_layer=24,
                 n_head=16,
                 d_inner=4096,
                 max_position_embeddings=512,
                 ff_activation="gelu",
                 untie_r=True,
                 attn_type="bi",
pytorch_transformers/modeling_tf_gpt2.py

@@ -249,7 +249,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
            token_type_ids = inputs.get('token_type_ids', None)
            position_ids = inputs.get('position_ids', None)
            head_mask = inputs.get('head_mask', None)
-            assert len(inputs) <= 5, "Too many inputs."
+            assert len(inputs) <= 6, "Too many inputs."

        if past is None:
            past_length = 0
@@ -551,7 +551,6 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
        self.transformer = TFGPT2MainLayer(config, name='transformer')
        self.multiple_choice_head = TFSequenceSummary(config, name='multiple_choice_head')

    def call(self, inputs, training=False):
        if not isinstance(inputs, (dict, tuple, list)):
            input_ids = inputs
@@ -573,7 +572,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
            token_type_ids = inputs.get('token_type_ids', None)
            position_ids = inputs.get('position_ids', None)
            head_mask = inputs.get('head_mask', None)
-            assert len(inputs) <= 5, "Too many inputs."
+            assert len(inputs) <= 7, "Too many inputs."

        input_shapes = shape_list(input_ids)
@@ -598,4 +597,4 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
        outputs = (lm_logits, mc_logits) + transformer_outputs[1:]

-        return outputs  # (lm loss), (mc loss), lm logits, mc logits, presents, (all hidden_states), (attentions)
+        return outputs  # lm logits, mc logits, presents, (all hidden_states), (attentions)
pytorch_transformers/modeling_tf_openai.py  (new file, mode 100644)
# coding=utf-8
# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TF 2.0 OpenAI GPT model."""
from __future__ import absolute_import, division, print_function, unicode_literals

import collections
import json
import logging
import math
import os
import sys
from io import open

import numpy as np
import tensorflow as tf

from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings,
                                TFSequenceSummary, shape_list)
from .configuration_openai import OpenAIGPTConfig
from .file_utils import add_start_docstrings
from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model

logger = logging.getLogger(__name__)

TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-tf_model.h5"}


def load_openai_gpt_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
    # build the network
    inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
    tf_inputs = tf.constant(inputs_list)
    tfo = tf_model(tf_inputs, training=False)
    return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)


def gelu(x):
    """Gaussian Error Linear Unit.
    This is a smoother version of the RELU.
    Original paper: https://arxiv.org/abs/1606.08415
    Args:
        x: float Tensor to perform activation.
    Returns:
        `x` with the GELU activation applied.
    """
    cdf = 0.5 * (1.0 + tf.tanh(
        (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
    return x * cdf


def swish(x):
    return x * tf.math.sigmoid(x)


ACT_FNS = {"gelu": tf.keras.layers.Activation(gelu),
           "relu": tf.keras.activations.relu,
           "swish": tf.keras.layers.Activation(swish)}
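
# Note (illustrative sketch, not part of this module): the tanh-based `gelu` above is the
# approximation used by the original GPT code; it can be sanity-checked against the exact
# erf-based form in a TF 2.0 eager session, e.g.
#
#     x = tf.constant([-2.0, -1.0, 0.0, 1.0, 2.0])
#     exact = 0.5 * x * (1.0 + tf.math.erf(x / tf.sqrt(2.0)))
#     tf.reduce_max(tf.abs(gelu(x) - exact))  # below 1e-3 on these points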

class TFAttention(tf.keras.layers.Layer):
    def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
        super(TFAttention, self).__init__(**kwargs)
        self.output_attentions = config.output_attentions

        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
        assert n_state % config.n_head == 0
        self.n_ctx = n_ctx
        self.n_head = config.n_head
        self.split_size = n_state
        self.scale = scale

        self.c_attn = TFConv1D(n_state * 3, nx, name='c_attn')
        self.c_proj = TFConv1D(n_state, nx, name='c_proj')
        self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop)
        self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        pass

    @staticmethod
    def causal_attention_mask(nd, ns, dtype):
        """1's in the lower triangle, counting from the lower right corner.
        Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
        """
        i = tf.range(nd)[:, None]
        j = tf.range(ns)
        m = i >= j - ns + nd
        return tf.cast(m, dtype)
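
    # For example (illustrative sketch only): causal_attention_mask(3, 5, tf.int32) evaluates to
    #     [[1, 1, 1, 0, 0],
    #      [1, 1, 1, 1, 0],
    #      [1, 1, 1, 1, 1]]
    # i.e. query i may attend to keys j with j <= i + (ns - nd); when nd == ns this is the usual
    # lower-triangular causal mask.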
    def _attn(self, inputs, training=False):
        q, k, v, attention_mask, head_mask = inputs
        # q, k, v have shape [batch, heads, sequence, features]
        w = tf.matmul(q, k, transpose_b=True)
        if self.scale:
            dk = tf.cast(tf.shape(k)[-1], tf.float32)  # scale attention_scores
            w = w / tf.math.sqrt(dk)

        # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
        _, _, nd, ns = shape_list(w)
        b = self.causal_attention_mask(nd, ns, dtype=w.dtype)
        b = tf.reshape(b, [1, 1, nd, ns])
        w = w * b - 1e4 * (1 - b)

        if attention_mask is not None:
            # Apply the attention mask
            w = w + attention_mask

        w = tf.nn.softmax(w, axis=-1)
        w = self.attn_dropout(w, training=training)

        # Mask heads if we want to
        if head_mask is not None:
            w = w * head_mask

        outputs = [tf.matmul(w, v)]
        if self.output_attentions:
            outputs.append(w)
        return outputs

    def merge_heads(self, x):
        x = tf.transpose(x, [0, 2, 1, 3])
        x_shape = shape_list(x)
        new_x_shape = x_shape[:-2] + [x_shape[-2] * x_shape[-1]]
        return tf.reshape(x, new_x_shape)

    def split_heads(self, x):
        x_shape = shape_list(x)
        new_x_shape = x_shape[:-1] + [self.n_head, x_shape[-1] // self.n_head]
        x = tf.reshape(x, new_x_shape)
        return tf.transpose(x, (0, 2, 1, 3))  # (batch, head, seq_length, head_features)

    def call(self, inputs, training=False):
        x, attention_mask, head_mask = inputs

        x = self.c_attn(x)
        query, key, value = tf.split(x, 3, axis=2)
        query = self.split_heads(query)
        key = self.split_heads(key)
        value = self.split_heads(value)

        attn_outputs = self._attn([query, key, value, attention_mask, head_mask], training=training)
        a = attn_outputs[0]

        a = self.merge_heads(a)
        a = self.c_proj(a)
        a = self.resid_dropout(a, training=training)

        outputs = [a] + attn_outputs[1:]
        return outputs  # a, (attentions)


class TFMLP(tf.keras.layers.Layer):
    def __init__(self, n_state, config, **kwargs):
        super(TFMLP, self).__init__(**kwargs)
        nx = config.n_embd
        self.c_fc = TFConv1D(n_state, nx, name='c_fc')
        self.c_proj = TFConv1D(nx, n_state, name='c_proj')
        self.act = gelu
        self.dropout = tf.keras.layers.Dropout(config.resid_pdrop)

    def call(self, x, training=False):
        h = self.act(self.c_fc(x))
        h2 = self.c_proj(h)
        h2 = self.dropout(h2, training=training)
        return h2


class TFBlock(tf.keras.layers.Layer):
    def __init__(self, n_ctx, config, scale=False, **kwargs):
        super(TFBlock, self).__init__(**kwargs)
        nx = config.n_embd
        self.attn = TFAttention(nx, n_ctx, config, scale, name='attn')
        self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_1')
        self.mlp = TFMLP(4 * nx, config, name='mlp')
        self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_2')

    def call(self, inputs, training=False):
        x, attention_mask, head_mask = inputs

        output_attn = self.attn([x, attention_mask, head_mask], training=training)
        a = output_attn[0]  # output_attn: a, (attentions)

        n = self.ln_1(x + a)
        m = self.mlp(n, training=training)
        h = self.ln_2(n + m)

        outputs = [h] + output_attn[1:]
        return outputs  # x, (attentions)


class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
    def __init__(self, config, *inputs, **kwargs):
        super(TFOpenAIGPTMainLayer, self).__init__(config, *inputs, **kwargs)
        self.output_hidden_states = config.output_hidden_states
        self.output_attentions = config.output_attentions
        self.num_hidden_layers = config.n_layer
        self.vocab_size = config.vocab_size
        self.n_embd = config.n_embd

        self.tokens_embed = TFSharedEmbeddings(config.vocab_size, config.n_embd, name='tokens_embed')
        self.positions_embed = tf.keras.layers.Embedding(config.n_positions, config.n_embd, name='positions_embed')
        self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
        self.h = [TFBlock(config.n_ctx, config, scale=True, name='h_._{}'.format(i)) for i in range(config.n_layer)]

    def _resize_token_embeddings(self, new_num_tokens):
        raise NotImplementedError

    def _prune_heads(self, heads_to_prune):
        """ Prunes heads of the model.
            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        """
        raise NotImplementedError

    def call(self, inputs, training=False):
        if not isinstance(inputs, (dict, tuple, list)):
            input_ids = inputs
            attention_mask, token_type_ids, position_ids, head_mask = None, None, None, None
        elif isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            attention_mask = inputs[1] if len(inputs) > 1 else None
            token_type_ids = inputs[2] if len(inputs) > 2 else None
            position_ids = inputs[3] if len(inputs) > 3 else None
            head_mask = inputs[4] if len(inputs) > 4 else None
            assert len(inputs) <= 5, "Too many inputs."
        else:
            input_ids = inputs.get('input_ids')
            attention_mask = inputs.get('attention_mask', None)
            token_type_ids = inputs.get('token_type_ids', None)
            position_ids = inputs.get('position_ids', None)
            head_mask = inputs.get('head_mask', None)
            assert len(inputs) <= 5, "Too many inputs."

        if position_ids is None:
            position_ids = tf.range(shape_list(input_ids)[-1], dtype=tf.int32)[tf.newaxis, :]

        if attention_mask is not None:
            # We create a 3D attention mask from a 2D tensor mask.
            # Sizes are [batch_size, 1, 1, to_seq_length]
            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
            # this attention mask is more simple than the triangular masking of causal attention
            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
            attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]

            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
            # masked positions, this operation will create a tensor which is 0.0 for
            # positions we want to attend and -10000.0 for masked positions.
            # Since we are adding it to the raw scores before the softmax, this is
            # effectively the same as removing these entirely.
            attention_mask = tf.cast(attention_mask, tf.float32)
            attention_mask = (1.0 - attention_mask) * -10000.0
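            # For instance (sketch): a 2D mask row [1., 1., 0.] becomes [0., 0., -10000.] here, so the
            # masked key position gets a large negative score and (effectively) zero weight after the softmax.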
        else:
            attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        if head_mask is not None:
            raise NotImplementedError
        else:
            head_mask = [None] * self.num_hidden_layers
            # head_mask = tf.constant([0] * self.num_hidden_layers)

        input_shape = shape_list(input_ids)
        input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
        position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])

        inputs_embeds = self.tokens_embed(input_ids, mode='embedding')
        position_embeds = self.positions_embed(position_ids)
        if token_type_ids is not None:
            token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
            token_type_embeds = self.tokens_embed(token_type_ids, mode='embedding')
        else:
            token_type_embeds = 0
        hidden_states = inputs_embeds + position_embeds + token_type_embeds
        hidden_states = self.drop(hidden_states, training=training)

        output_shape = input_shape + [shape_list(hidden_states)[-1]]

        all_attentions = []
        all_hidden_states = ()
        for i, block in enumerate(self.h):
            if self.output_hidden_states:
                all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)

            outputs = block([hidden_states, attention_mask, head_mask[i]], training=training)
            hidden_states = outputs[0]
            if self.output_attentions:
                all_attentions.append(outputs[1])

        hidden_states = tf.reshape(hidden_states, output_shape)
        # Add last hidden state
        if self.output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        outputs = (hidden_states,)
        if self.output_hidden_states:
            outputs = outputs + (all_hidden_states,)
        if self.output_attentions:
            # let the number of heads free (-1) so we can extract attention even after head pruning
            attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
            all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
            outputs = outputs + (all_attentions,)
        return outputs  # last hidden state, (all hidden_states), (attentions)


class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel):
    """ An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
    """
    config_class = OpenAIGPTConfig
    pretrained_model_archive_map = TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
    load_pt_weights = load_openai_gpt_pt_weights_in_tf2
    base_model_prefix = "transformer"


OPENAI_GPT_START_DOCSTRING = r"""    OpenAI GPT model was proposed in
    `Improving Language Understanding by Generative Pre-Training`_
    by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
    It's a causal (unidirectional) transformer pre-trained using language modeling on a large
    corpus with long range dependencies, the Toronto Book Corpus.

    This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
    refer to the TF 2.0 documentation for all matter related to general usage and behavior.

    .. _`Improving Language Understanding by Generative Pre-Training`:
        https://openai.com/blog/language-unsupervised/

    .. _`tf.keras.Model`:
        https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model

    Important note on the model inputs:
        The inputs of the TF 2.0 models are slightly different from the PyTorch ones since
        TF 2.0 Keras doesn't accept named arguments with default values for input Tensors.
        More precisely, input Tensors are gathered in the first argument of the model call function: `model(inputs)`.
        There are three possibilities to gather and feed the inputs to the model:

        - a single Tensor with input_ids only and nothing else: `model(input_ids)`
        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
            `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
            `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`

    Parameters:
        config (:class:`~pytorch_transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""
OPENAI_GPT_INPUTS_DOCSTRING = r"""    Inputs:
        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
            Indices of input sequence tokens in the vocabulary.
            GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on
            the right rather than the left.
            Indices can be obtained using :class:`pytorch_transformers.OpenAIGPTTokenizer`.
            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
            The embeddings from these tokens will be summed with the respective token embeddings.
            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
            Indices of positions of each input sequence tokens in the position embeddings.
            Selected in the range ``[0, config.max_position_embeddings - 1]``.
        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""


@add_start_docstrings("The bare OpenAI GPT transformer model outputting raw hidden-states without any specific head on top.",
                      OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
            Sequence of hidden-states at the last layer of the model.
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        model = OpenAIGPTModel.from_pretrained('openai-gpt')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFOpenAIGPTModel, self).__init__(config, *inputs, **kwargs)
        self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')

    def call(self, inputs, training=False):
        outputs = self.transformer(inputs, training=training)
        return outputs


@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
    r"""
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Language modeling loss.
        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=input_ids)
        loss, logits = outputs[:2]

    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFOpenAIGPTLMHeadModel, self).__init__(config, *inputs, **kwargs)
        self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')

    def call(self, inputs, training=False):
        transformer_outputs = self.transformer(inputs, training=training)
        hidden_states = transformer_outputs[0]

        lm_logits = self.transformer.tokens_embed(hidden_states, mode="linear")

        outputs = (lm_logits,) + transformer_outputs[1:]

        return outputs  # lm_logits, (all hidden_states), (attentions)


@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling and a multiple-choice classification
head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
The language modeling head has its weights tied to the input embeddings,
the classification head takes as input the hidden state of a specified classification token index in the input sequence.
""", OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
    r"""
        **mc_token_ids**: (`optional`, default to index of the last token of the input) ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
            Index of the classification token in each input sequence.
            Selected in the range ``[0, input_ids.size(-1) - 1[``.

    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **lm_loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Language modeling loss.
        **mc_loss**: (`optional`, returned when ``multiple_choice_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Multiple choice classification loss.
        **lm_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length, config.vocab_size)``
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        **mc_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)``
            Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
        tokenizer.add_special_tokens({'cls_token': '[CLS]'})  # Add a [CLS] to the vocabulary (we should train it also!)
        choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
        input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
        mc_token_ids = torch.tensor([input_ids.size(-1) - 1, input_ids.size(-1) - 1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, mc_token_ids=mc_token_ids)
        lm_prediction_scores, mc_prediction_scores = outputs[:2]

    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFOpenAIGPTDoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
        self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')
        self.multiple_choice_head = TFSequenceSummary(config, name='multiple_choice_head')

    def call(self, inputs, training=False):
        if not isinstance(inputs, (dict, tuple, list)):
            input_ids = inputs
            mc_token_ids, attention_mask, token_type_ids, position_ids, head_mask = None, None, None, None, None
        elif isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            mc_token_ids = inputs[1] if len(inputs) > 1 else None
            attention_mask = inputs[2] if len(inputs) > 2 else None
            token_type_ids = inputs[3] if len(inputs) > 3 else None
            position_ids = inputs[4] if len(inputs) > 4 else None
            head_mask = inputs[5] if len(inputs) > 5 else None
            assert len(inputs) <= 6, "Too many inputs."
        else:
            input_ids = inputs.get('input_ids')
            mc_token_ids = inputs.get('mc_token_ids', None)
            attention_mask = inputs.get('attention_mask', None)
            token_type_ids = inputs.get('token_type_ids', None)
            position_ids = inputs.get('position_ids', None)
            head_mask = inputs.get('head_mask', None)
            assert len(inputs) <= 6, "Too many inputs."

        input_shapes = shape_list(input_ids)

        seq_length = input_shapes[-1]

        flat_input_ids = tf.reshape(input_ids, (-1, seq_length))
        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
        flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
        flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None

        flat_inputs = [flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask]

        transformer_outputs = self.transformer(flat_inputs, training=training)
        hidden_states = transformer_outputs[0]

        hidden_states = tf.reshape(hidden_states, input_shapes + shape_list(hidden_states)[-1:])

        lm_logits = self.transformer.tokens_embed(hidden_states, mode="linear")
        mc_logits = self.multiple_choice_head([hidden_states, mc_token_ids], training=training)

        mc_logits = tf.squeeze(mc_logits, axis=-1)

        outputs = (lm_logits, mc_logits) + transformer_outputs[1:]

        return outputs  # lm logits, mc logits, (all hidden_states), (attentions)
pytorch_transformers/tests/modeling_tf_openai_gpt_test.py  (new file, mode 100644)
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import unittest
import shutil
import pytest
import sys

from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester

from pytorch_transformers import OpenAIGPTConfig, is_tf_available

if is_tf_available():
    import tensorflow as tf
    from pytorch_transformers.modeling_tf_openai import (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel,
                                                         TFOpenAIGPTDoubleHeadsModel,
                                                         TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
else:
    pytestmark = pytest.mark.skip("Require TensorFlow")


class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):

    all_model_classes = (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel,
                         TFOpenAIGPTDoubleHeadsModel) if is_tf_available() else ()

    class TFOpenAIGPTModelTester(object):

        def __init__(self,
                     parent,
                     batch_size=13,
                     seq_length=7,
                     is_training=True,
                     use_token_type_ids=True,
                     use_input_mask=True,
                     use_labels=True,
                     use_mc_token_ids=True,
                     vocab_size=99,
                     hidden_size=32,
                     num_hidden_layers=5,
                     num_attention_heads=4,
                     intermediate_size=37,
                     hidden_act="gelu",
                     hidden_dropout_prob=0.1,
                     attention_probs_dropout_prob=0.1,
                     max_position_embeddings=512,
                     type_vocab_size=16,
                     type_sequence_label_size=2,
                     initializer_range=0.02,
                     num_labels=3,
                     num_choices=4,
                     scope=None,
                     ):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_token_type_ids = use_token_type_ids
            self.use_input_mask = use_input_mask
            self.use_labels = use_labels
            self.use_mc_token_ids = use_mc_token_ids
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.intermediate_size = intermediate_size
            self.hidden_act = hidden_act
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.num_choices = num_choices
            self.scope = scope

        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

            mc_token_ids = None
            if self.use_mc_token_ids:
                mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = OpenAIGPTConfig(
                vocab_size_or_config_json_file=self.vocab_size,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
                n_head=self.num_attention_heads,
                # intermediate_size=self.intermediate_size,
                # hidden_act=self.hidden_act,
                # hidden_dropout_prob=self.hidden_dropout_prob,
                # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                n_positions=self.max_position_embeddings,
                n_ctx=self.max_position_embeddings
                # type_vocab_size=self.type_vocab_size,
                # initializer_range=self.initializer_range
            )

            head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)

            return (config, input_ids, input_mask, head_mask, token_type_ids,
                    mc_token_ids, sequence_labels, token_labels, choice_labels)

        def create_and_check_openai_gpt_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
            model = TFOpenAIGPTModel(config=config)
            inputs = {'input_ids': input_ids,
                      'attention_mask': input_mask,
                      'token_type_ids': token_type_ids}
            sequence_output = model(inputs)[0]

            inputs = [input_ids, input_mask]
            sequence_output = model(inputs)[0]

            sequence_output = model(input_ids)[0]

            result = {
                "sequence_output": sequence_output.numpy(),
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].shape),
                [self.batch_size, self.seq_length, self.hidden_size])

        def create_and_check_openai_gpt_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
            model = TFOpenAIGPTLMHeadModel(config=config)
            inputs = {'input_ids': input_ids,
                      'attention_mask': input_mask,
                      'token_type_ids': token_type_ids}
            prediction_scores = model(inputs)[0]

            result = {
                "prediction_scores": prediction_scores.numpy(),
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].shape),
                [self.batch_size, self.seq_length, self.vocab_size])

        def create_and_check_openai_gpt_double_head(self, config, input_ids, input_mask, head_mask, token_type_ids,
                                                    mc_token_ids, *args):
            model = TFOpenAIGPTDoubleHeadsModel(config=config)

            multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
            multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
            multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))

            inputs = {'input_ids': multiple_choice_inputs_ids,
                      'mc_token_ids': mc_token_ids,
                      'attention_mask': multiple_choice_input_mask,
                      'token_type_ids': multiple_choice_token_type_ids}
            lm_logits, mc_logits = model(inputs)[:2]

            result = {
                "lm_logits": lm_logits.numpy(),
                "mc_logits": mc_logits.numpy()
            }
            self.parent.assertListEqual(
                list(result["lm_logits"].shape),
                [self.batch_size, self.num_choices, self.seq_length, self.vocab_size])
            self.parent.assertListEqual(
                list(result["mc_logits"].shape),
                [self.batch_size, self.num_choices])

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, input_mask, head_mask, token_type_ids,
             mc_token_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs
            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
            return config, inputs_dict

    def setUp(self):
        self.model_tester = TFOpenAIGPTModelTest.TFOpenAIGPTModelTester(self)
        self.config_tester = ConfigTester(self, config_class=OpenAIGPTConfig, n_embd=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_openai_gpt_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_openai_gpt_model(*config_and_inputs)

    def test_openai_gpt_lm_head(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_openai_gpt_lm_head(*config_and_inputs)

    def test_openai_gpt_double_head(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_openai_gpt_double_head(*config_and_inputs)

    @pytest.mark.slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/pytorch_transformers_test/"
        for model_name in list(TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = TFOpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)


if __name__ == "__main__":
    unittest.main()
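
A likely way to run the new tests locally (assuming pytest is installed and the command is issued from the repository root) is:

    python -m pytest pytorch_transformers/tests/modeling_tf_openai_gpt_test.py -v

The `test_model_from_pretrained` case is marked `@pytest.mark.slow` because it downloads the pretrained `openai-gpt` TF weights; it can be deselected with `-m "not slow"`.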