Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
fbe04423
Commit
fbe04423
authored
Jul 04, 2019
by
thomwolf
Browse files
Common SequenceSummary class
parent
c22545aa
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
130 additions
and
125 deletions
+130
-125
pytorch_pretrained_bert/__init__.py
pytorch_pretrained_bert/__init__.py
+1
-1
pytorch_pretrained_bert/model_utils.py
pytorch_pretrained_bert/model_utils.py
+89
-19
pytorch_pretrained_bert/modeling_gpt2.py
pytorch_pretrained_bert/modeling_gpt2.py
+14
-34
pytorch_pretrained_bert/modeling_openai.py
pytorch_pretrained_bert/modeling_openai.py
+16
-34
pytorch_pretrained_bert/modeling_xlnet.py
pytorch_pretrained_bert/modeling_xlnet.py
+10
-37
No files found.
pytorch_pretrained_bert/__init__.py
View file @
fbe04423
...
...
@@ -17,7 +17,7 @@ from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel,
from
.modeling_transfo_xl
import
(
TransfoXLConfig
,
TransfoXLModel
,
TransfoXLLMHeadModel
,
load_tf_weights_in_transfo_xl
)
from
.modeling_gpt2
import
(
GPT2Config
,
GPT2Model
,
GPT2LMHeadModel
,
GPT2DoubleHeadsModel
,
GPT2MultipleChoiceHead
,
GPT2LMHeadModel
,
GPT2DoubleHeadsModel
,
load_tf_weights_in_gpt2
)
from
.modeling_xlnet
import
(
XLNetConfig
,
XLNetPreTrainedModel
,
XLNetModel
,
XLNetLMHeadModel
,
...
...
pytorch_pretrained_bert/model_utils.py
View file @
fbe04423
...
...
@@ -282,6 +282,95 @@ class PreTrainedModel(nn.Module):
return
model
class
Conv1D
(
nn
.
Module
):
def
__init__
(
self
,
nf
,
nx
):
""" Conv1D layer as defined by Alec for GPT (and also used in GPT-2)
Basically works like a Linear layer but the weights are transposed
"""
super
(
Conv1D
,
self
).
__init__
()
self
.
nf
=
nf
w
=
torch
.
empty
(
nx
,
nf
)
nn
.
init
.
normal_
(
w
,
std
=
0.02
)
self
.
weight
=
nn
.
Parameter
(
w
)
self
.
bias
=
nn
.
Parameter
(
torch
.
zeros
(
nf
))
def
forward
(
self
,
x
):
size_out
=
x
.
size
()[:
-
1
]
+
(
self
.
nf
,)
x
=
torch
.
addmm
(
self
.
bias
,
x
.
view
(
-
1
,
x
.
size
(
-
1
)),
self
.
weight
)
x
=
x
.
view
(
*
size_out
)
return
x
class
SequenceSummary
(
nn
.
Module
):
def
__init__
(
self
,
config
):
""" Compute a single vector summary of a sequence hidden states according to various possibilities:
Args of the config class:
summary_type:
- 'last' => [default] take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'token_ids' => supply a Tensor of classification token indices (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj: Add a projection after the vector extraction
summary_num_classes: If > 0: the projection outputs to n classes (otherwise to hidden_size)
summary_activation:
'tanh' => add a tanh activation to the output
None => no activation
"""
super
(
SequenceSummary
,
self
).
__init__
()
self
.
summary_type
=
config
.
summary_type
if
hasattr
(
config
,
'summary_use_proj'
)
else
'last'
if
config
.
summary_type
==
'attn'
:
# We should use a standard multi-head attention module with absolute positional embedding for that.
# Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
# We can probably just use the multi-head attention module of PyTorch >=1.1.0
raise
NotImplementedError
self
.
summary
=
nn
.
Identity
()
if
hasattr
(
config
,
'summary_use_proj'
)
and
config
.
summary_use_proj
:
if
hasattr
(
config
,
'summary_num_classes'
)
and
config
.
summary_num_classes
>
0
:
num_classes
=
config
.
summary_num_classes
else
:
num_classes
=
config
.
hidden_size
self
.
summary
=
nn
.
Linear
(
config
.
hidden_size
,
num_classes
)
self
.
activation
=
nn
.
Identity
()
if
hasattr
(
config
,
'summary_activation'
)
and
config
.
summary_activation
==
'tanh'
:
self
.
activation
=
nn
.
Tanh
()
self
.
dropout
=
nn
.
Dropout
(
config
.
summary_dropout
)
def
forward
(
self
,
hidden_states
,
token_ids
=
None
):
""" hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer.
token_ids: [optional] index of the classification token if summary_type == 'token_ids',
shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states.
if summary_type == 'token_ids' and token_ids is None:
we take the last token of the sequence as classification token
"""
if
self
.
summary_type
==
'last'
:
output
=
hidden_states
[:,
-
1
]
elif
self
.
summary_type
==
'first'
:
output
=
hidden_states
[:,
0
]
elif
self
.
summary_type
==
'mean'
:
output
=
hidden_states
.
mean
(
dim
=
1
)
elif
self
.
summary_type
==
'token_ids'
:
if
token_ids
is
None
:
token_ids
=
torch
.
full_like
(
hidden_states
[...,
:
1
,
:],
hidden_states
.
shape
[
-
2
]
-
1
,
dtype
=
torch
.
long
)
else
:
token_ids
=
token_ids
.
unsqueeze
(
-
1
).
unsqueeze
(
-
1
)
token_ids
=
token_ids
.
expand
((
-
1
,)
*
(
token_ids
.
dim
()
-
1
)
+
(
hidden_states
.
size
(
-
1
),))
# shape of token_ids: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
output
=
hidden_states
.
gather
(
-
2
,
token_ids
).
squeeze
(
-
2
)
# shape (bsz, XX, hidden_size)
elif
self
.
summary_type
==
'attn'
:
raise
NotImplementedError
output
=
self
.
summary
(
output
)
output
=
self
.
activation
(
output
)
output
=
self
.
dropout
(
output
)
return
output
def
prune_linear_layer
(
layer
,
index
,
dim
=
0
):
""" Prune a linear layer (a model parameters) to keep only entries in index.
Return the pruned layer as a new layer with requires_grad=True.
...
...
@@ -307,25 +396,6 @@ def prune_linear_layer(layer, index, dim=0):
return
new_layer
class
Conv1D
(
nn
.
Module
):
""" Conv1D layer as defined by Alec Radford for GPT (and also used in GPT-2)
Basically works like a Linear layer but the weights are transposed
"""
def
__init__
(
self
,
nf
,
nx
):
super
(
Conv1D
,
self
).
__init__
()
self
.
nf
=
nf
w
=
torch
.
empty
(
nx
,
nf
)
nn
.
init
.
normal_
(
w
,
std
=
0.02
)
self
.
weight
=
nn
.
Parameter
(
w
)
self
.
bias
=
nn
.
Parameter
(
torch
.
zeros
(
nf
))
def
forward
(
self
,
x
):
size_out
=
x
.
size
()[:
-
1
]
+
(
self
.
nf
,)
x
=
torch
.
addmm
(
self
.
bias
,
x
.
view
(
-
1
,
x
.
size
(
-
1
)),
self
.
weight
)
x
=
x
.
view
(
*
size_out
)
return
x
def
prune_conv1d_layer
(
layer
,
index
,
dim
=
1
):
""" Prune a Conv1D layer (a model parameters) to keep only entries in index.
A Conv1D work as a Linear layer (see e.g. BERT) but the weights are transposed.
...
...
pytorch_pretrained_bert/modeling_gpt2.py
View file @
fbe04423
...
...
@@ -31,7 +31,8 @@ from torch.nn import CrossEntropyLoss
from
torch.nn.parameter
import
Parameter
from
.file_utils
import
cached_path
from
.model_utils
import
Conv1D
,
CONFIG_NAME
,
WEIGHTS_NAME
,
PretrainedConfig
,
PreTrainedModel
,
prune_conv1d_layer
from
.model_utils
import
(
Conv1D
,
CONFIG_NAME
,
WEIGHTS_NAME
,
PretrainedConfig
,
PreTrainedModel
,
prune_conv1d_layer
,
SequenceSummary
)
from
.modeling_bert
import
BertLayerNorm
as
LayerNorm
logger
=
logging
.
getLogger
(
__name__
)
...
...
@@ -119,6 +120,11 @@ class GPT2Config(PretrainedConfig):
layer_norm_epsilon
=
1e-5
,
initializer_range
=
0.02
,
predict_special_tokens
=
True
,
summary_type
=
'token_ids'
,
summary_use_proj
=
True
,
summary_num_classes
=
1
,
summary_activation
=
None
,
summary_dropout
=
0.1
,
**
kwargs
):
"""Constructs GPT2Config.
...
...
@@ -164,6 +170,11 @@ class GPT2Config(PretrainedConfig):
self
.
layer_norm_epsilon
=
layer_norm_epsilon
self
.
initializer_range
=
initializer_range
self
.
predict_special_tokens
=
predict_special_tokens
self
.
summary_type
=
summary_type
self
.
summary_use_proj
=
summary_use_proj
self
.
summary_num_classes
=
summary_num_classes
self
.
summary_activation
=
summary_activation
self
.
summary_dropout
=
summary_dropout
else
:
raise
ValueError
(
"First argument must be either a vocabulary size (int)"
...
...
@@ -342,37 +353,6 @@ class GPT2LMHead(nn.Module):
return
lm_logits
class
GPT2MultipleChoiceHead
(
nn
.
Module
):
""" Classifier Head for the transformer """
def
__init__
(
self
,
config
):
super
(
GPT2MultipleChoiceHead
,
self
).
__init__
()
self
.
n_embd
=
config
.
n_embd
self
.
dropout
=
nn
.
Dropout2d
(
config
.
resid_pdrop
)
# To reproduce the noise_shape parameter of TF implementation
self
.
linear
=
nn
.
Linear
(
config
.
n_embd
,
1
)
nn
.
init
.
normal_
(
self
.
linear
.
weight
,
std
=
0.02
)
nn
.
init
.
normal_
(
self
.
linear
.
bias
,
0
)
def
forward
(
self
,
hidden_states
,
mc_token_ids
=
None
):
""" Extract classification token hidden state and project it using self.linear
hidden_state: shape (bsz, num_choices, seq_length, hidden_size)
mc_token_ids: [optional] index of the classification token, shape (bsz, num_choices)
if mc_token_ids=None we take the last token of the sequence as classification token
"""
if
mc_token_ids
is
None
:
mc_token_ids
=
torch
.
full_like
(
hidden_states
[:,
:,
:
1
,
:],
hidden_states
.
shape
[
2
]
-
1
,
dtype
=
torch
.
long
)
else
:
mc_token_ids
=
mc_token_ids
.
unsqueeze
(
-
1
).
unsqueeze
(
-
1
).
expand
(
-
1
,
-
1
,
-
1
,
hidden_states
.
size
(
-
1
))
# mc_token_ids has shape (bsz, num_choices, 1, hidden_size)
multiple_choice_h
=
hidden_states
.
gather
(
2
,
mc_token_ids
).
squeeze
(
2
)
# (bsz, num_choices, hidden_size)
multiple_choice_h
=
self
.
dropout
(
multiple_choice_h
.
transpose
(
1
,
2
)).
transpose
(
1
,
2
)
multiple_choice_logits
=
self
.
linear
(
multiple_choice_h
).
squeeze
(
-
1
)
# (bsz, num_choices)
return
multiple_choice_logits
class
GPT2PreTrainedModel
(
PreTrainedModel
):
""" An abstract class to handle weights initialization and
a simple interface for dowloading and loading pretrained models.
...
...
@@ -735,7 +715,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
super
(
GPT2DoubleHeadsModel
,
self
).
__init__
(
config
)
self
.
transformer
=
GPT2Model
(
config
)
self
.
lm_head
=
GPT2LMHead
(
self
.
transformer
.
wte
.
weight
,
config
)
self
.
multiple_choice_head
=
GPT2MultipleChoiceHead
(
config
)
self
.
multiple_choice_head
=
SequenceSummary
(
config
)
self
.
apply
(
self
.
init_weights
)
...
...
@@ -753,7 +733,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
hidden_states
=
transformer_outputs
[
0
]
lm_logits
=
self
.
lm_head
(
hidden_states
)
mc_logits
=
self
.
multiple_choice_head
(
hidden_states
,
mc_token_ids
)
mc_logits
=
self
.
multiple_choice_head
(
hidden_states
,
mc_token_ids
)
.
squeeze
(
-
1
)
outputs
=
(
lm_logits
,
mc_logits
)
+
transformer_outputs
[
1
:]
if
mc_labels
is
not
None
:
...
...
pytorch_pretrained_bert/modeling_openai.py
View file @
fbe04423
...
...
@@ -31,7 +31,8 @@ from torch.nn import CrossEntropyLoss
from
torch.nn.parameter
import
Parameter
from
.file_utils
import
cached_path
from
.model_utils
import
Conv1D
,
CONFIG_NAME
,
WEIGHTS_NAME
,
PretrainedConfig
,
PreTrainedModel
,
prune_conv1d_layer
from
.model_utils
import
(
Conv1D
,
CONFIG_NAME
,
WEIGHTS_NAME
,
PretrainedConfig
,
PreTrainedModel
,
prune_conv1d_layer
,
SequenceSummary
)
from
.modeling_bert
import
BertLayerNorm
as
LayerNorm
logger
=
logging
.
getLogger
(
__name__
)
...
...
@@ -147,6 +148,11 @@ class OpenAIGPTConfig(PretrainedConfig):
layer_norm_epsilon
=
1e-5
,
initializer_range
=
0.02
,
predict_special_tokens
=
True
,
summary_type
=
'token_ids'
,
summary_use_proj
=
True
,
summary_num_classes
=
1
,
summary_activation
=
None
,
summary_dropout
=
0.1
,
**
kwargs
):
"""Constructs OpenAIGPTConfig.
...
...
@@ -195,6 +201,11 @@ class OpenAIGPTConfig(PretrainedConfig):
self
.
layer_norm_epsilon
=
layer_norm_epsilon
self
.
initializer_range
=
initializer_range
self
.
predict_special_tokens
=
predict_special_tokens
self
.
summary_type
=
summary_type
self
.
summary_use_proj
=
summary_use_proj
self
.
summary_num_classes
=
summary_num_classes
self
.
summary_activation
=
summary_activation
self
.
summary_dropout
=
summary_dropout
else
:
raise
ValueError
(
"First argument must be either a vocabulary size (int)"
...
...
@@ -368,37 +379,6 @@ class OpenAIGPTLMHead(nn.Module):
return
lm_logits
class
OpenAIGPTMultipleChoiceHead
(
nn
.
Module
):
""" Classifier Head for the transformer """
def
__init__
(
self
,
config
):
super
(
OpenAIGPTMultipleChoiceHead
,
self
).
__init__
()
self
.
n_embd
=
config
.
n_embd
self
.
dropout
=
nn
.
Dropout2d
(
config
.
resid_pdrop
)
# To reproduce the noise_shape parameter of TF implementation
self
.
linear
=
nn
.
Linear
(
config
.
n_embd
,
1
)
nn
.
init
.
normal_
(
self
.
linear
.
weight
,
std
=
0.02
)
nn
.
init
.
normal_
(
self
.
linear
.
bias
,
0
)
def
forward
(
self
,
hidden_states
,
mc_token_ids
=
None
):
""" Extract classification token hidden state and project it using self.linear
hidden_state: hidden state of shape (bsz, num_choices, seq_length, hidden_size)
mc_token_ids: [optional] index of the classification token, shape (bsz, num_choices)
if mc_token_ids=None we take the last token of the sequence as classification token
"""
if
mc_token_ids
is
None
:
mc_token_ids
=
torch
.
full_like
(
hidden_states
[:,
:,
:
1
,
:],
hidden_states
.
shape
[
2
]
-
1
,
dtype
=
torch
.
long
)
else
:
mc_token_ids
=
mc_token_ids
.
unsqueeze
(
-
1
).
unsqueeze
(
-
1
).
expand
(
-
1
,
-
1
,
-
1
,
hidden_states
.
size
(
-
1
))
# (bsz, num_choices, 1, hidden_size)
multiple_choice_h
=
hidden_states
.
gather
(
2
,
mc_token_ids
).
squeeze
(
2
)
# (bsz, num_choices, hidden_size)
multiple_choice_h
=
self
.
dropout
(
multiple_choice_h
.
transpose
(
1
,
2
)).
transpose
(
1
,
2
)
multiple_choice_logits
=
self
.
linear
(
multiple_choice_h
).
squeeze
(
-
1
)
# (bsz, num_choices)
return
multiple_choice_logits
class
OpenAIGPTPreTrainedModel
(
PreTrainedModel
):
""" An abstract class to handle weights initialization and
a simple interface for dowloading and loading pretrained models.
...
...
@@ -768,9 +748,11 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
def
__init__
(
self
,
config
):
super
(
OpenAIGPTDoubleHeadsModel
,
self
).
__init__
(
config
)
self
.
transformer
=
OpenAIGPTModel
(
config
)
self
.
lm_head
=
OpenAIGPTLMHead
(
self
.
transformer
.
tokens_embed
.
weight
,
config
)
self
.
multiple_choice_head
=
OpenAIGPTMultipleChoiceHead
(
config
)
self
.
multiple_choice_head
=
SequenceSummary
(
config
)
self
.
apply
(
self
.
init_weights
)
def
set_num_special_tokens
(
self
,
num_special_tokens
,
predict_special_tokens
=
True
):
...
...
@@ -787,7 +769,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
hidden_states
=
transformer_outputs
[
0
]
lm_logits
=
self
.
lm_head
(
hidden_states
)
mc_logits
=
self
.
multiple_choice_head
(
hidden_states
,
mc_token_ids
)
mc_logits
=
self
.
multiple_choice_head
(
hidden_states
,
mc_token_ids
)
.
squeeze
(
-
1
)
outputs
=
(
lm_logits
,
mc_logits
)
+
transformer_outputs
[
1
:]
if
mc_labels
is
not
None
:
...
...
pytorch_pretrained_bert/modeling_xlnet.py
View file @
fbe04423
...
...
@@ -32,7 +32,8 @@ from torch.nn import functional as F
from
torch.nn
import
CrossEntropyLoss
,
MSELoss
from
.file_utils
import
cached_path
from
.model_utils
import
CONFIG_NAME
,
WEIGHTS_NAME
,
PretrainedConfig
,
PreTrainedModel
from
.model_utils
import
(
CONFIG_NAME
,
WEIGHTS_NAME
,
PretrainedConfig
,
PreTrainedModel
,
SequenceSummary
)
logger
=
logging
.
getLogger
(
__name__
)
...
...
@@ -223,8 +224,10 @@ class XLNetConfig(PretrainedConfig):
finetuning_task
=
None
,
num_labels
=
2
,
summary_type
=
"last"
,
use_proj
=
True
,
summary_type
=
'last'
,
summary_use_proj
=
True
,
summary_activation
=
'tanh'
,
summary_dropout
=
0.1
,
**
kwargs
):
"""Constructs XLNetConfig.
...
...
@@ -307,7 +310,9 @@ class XLNetConfig(PretrainedConfig):
self
.
finetuning_task
=
finetuning_task
self
.
num_labels
=
num_labels
self
.
summary_type
=
summary_type
self
.
use_proj
=
use_proj
self
.
summary_use_proj
=
summary_use_proj
self
.
summary_activation
=
summary_activation
self
.
summary_dropout
=
summary_dropout
else
:
raise
ValueError
(
"First argument must be either a vocabulary size (int)"
"or the path to a pretrained model config file (str)"
)
...
...
@@ -1042,38 +1047,6 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
return
outputs
# return (loss), logits, (mems), (hidden states), (attentions)
class
XLNetSequenceSummary
(
nn
.
Module
):
def
__init__
(
self
,
config
):
super
(
XLNetSequenceSummary
,
self
).
__init__
()
self
.
summary_type
=
config
.
summary_type
if
config
.
use_proj
:
self
.
summary
=
nn
.
Linear
(
config
.
d_model
,
config
.
d_model
)
else
:
self
.
summary
=
None
if
config
.
summary_type
==
'attn'
:
# We should use a standard multi-head attention module with absolute positional embedding for that.
# Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
# We can probably just use the multi-head attention module of PyTorch >=1.1.0
raise
NotImplementedError
self
.
dropout
=
nn
.
Dropout
(
config
.
dropout
)
self
.
activation
=
nn
.
Tanh
()
def
forward
(
self
,
hidden_states
):
""" hidden_states: float Tensor in shape [bsz, seq_len, d_model], the hidden-states of the last layer."""
if
self
.
summary_type
==
'last'
:
output
=
hidden_states
[:,
-
1
]
elif
self
.
summary_type
==
'first'
:
output
=
hidden_states
[:,
0
]
elif
self
.
summary_type
==
'mean'
:
output
=
hidden_states
.
mean
(
dim
=
1
)
elif
self
.
summary_type
==
'attn'
:
raise
NotImplementedError
output
=
self
.
summary
(
output
)
output
=
self
.
activation
(
output
)
output
=
self
.
dropout
(
output
)
return
output
class
XLNetForSequenceClassification
(
XLNetPreTrainedModel
):
"""XLNet model ("XLNet: Generalized Autoregressive Pretraining for Language Understanding").
...
...
@@ -1143,7 +1116,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
super
(
XLNetForSequenceClassification
,
self
).
__init__
(
config
)
self
.
transformer
=
XLNetModel
(
config
)
self
.
sequence_summary
=
XLNet
SequenceSummary
(
config
)
self
.
sequence_summary
=
SequenceSummary
(
config
)
self
.
logits_proj
=
nn
.
Linear
(
config
.
d_model
,
config
.
num_labels
)
self
.
apply
(
self
.
init_weights
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment