chenpangpang / transformers

Commit 93f563b8
authored Jan 07, 2019 by thomwolf
parent 8da280eb

adding OpenAI GPT

Showing 5 changed files with 699 additions and 11 deletions (+699, -11)
pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py   +174  -0
pytorch_pretrained_bert/modeling.py                               +11   -11
pytorch_pretrained_bert/modeling_openai.py                        +302  -0
pytorch_pretrained_bert/optimization_openai.py                    +104  -0
pytorch_pretrained_bert/tokenization_openai.py                    +108  -0
pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py (new file, mode 100755)
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert BERT checkpoint."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import re
import json  # used by load_openai_pretrained_model below
import argparse
import tensorflow as tf
import torch
import numpy as np

from .modeling import BertConfig, BertForPreTraining


def convert_openai_checkpoint_to_pytorch(open_checkpoint_folder_path, openai_config_file, pytorch_dump_path):
    def load_openai_pretrained_model(model, n_ctx=-1, n_special=-1, n_transfer=12, n_embd=768,
                                     path='./model/', path_names='./'):
        # Load weights from TF model
        print("Loading weights...")
        names = json.load(open(path_names + 'parameters_names.json'))
        shapes = json.load(open(path + 'params_shapes.json'))
        offsets = np.cumsum([np.prod(shape) for shape in shapes])
        init_params = [np.load(path + 'params_{}.npy'.format(n)) for n in range(10)]
        init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
        init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
        if n_ctx > 0:
            init_params[0] = init_params[0][:n_ctx]
        if n_special > 0:
            init_params[0] = np.concatenate(
                [init_params[1],
                 (np.random.randn(n_special, n_embd) * 0.02).astype(np.float32),
                 init_params[0]
                 ], 0)
        else:
            init_params[0] = np.concatenate(
                [init_params[1],
                 init_params[0]
                 ], 0)
        del init_params[1]
        if n_transfer == -1:
            n_transfer = 0
        else:
            n_transfer = 1 + n_transfer * 12
        init_params = [arr.squeeze() for arr in init_params]

        try:
            assert model.embed.weight.shape == init_params[0].shape
        except AssertionError as e:
            e.args += (model.embed.weight.shape, init_params[0].shape)
            raise

        model.embed.weight.data = torch.from_numpy(init_params[0])

        for name, ip in zip(names[1:n_transfer], init_params[1:n_transfer]):
            name = name[6:]  # skip "model/"
            assert name[-2:] == ":0"
            name = name[:-2]
            name = name.split('/')
            pointer = model
            for m_name in name:
                if re.fullmatch(r'[A-Za-z]+\d+', m_name):
                    l = re.split(r'(\d+)', m_name)
                else:
                    l = [m_name]
                pointer = getattr(pointer, l[0])
                if len(l) >= 2:
                    num = int(l[1])
                    pointer = pointer[num]
            try:
                assert pointer.shape == ip.shape
            except AssertionError as e:
                e.args += (pointer.shape, ip.shape)
                raise
            pointer.data = torch.from_numpy(ip)


def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
    config_path = os.path.abspath(bert_config_file)
    tf_path = os.path.abspath(tf_checkpoint_path)
    print("Converting TensorFlow checkpoint from {} with config at {}".format(tf_path, config_path))
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        print("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    # Initialise PyTorch model
    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = BertForPreTraining(config)

    for name, array in zip(names, arrays):
        name = name.split('/')
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
        # which are not required for using the pretrained model
        if any(n in ["adam_v", "adam_m"] for n in name):
            print("Skipping {}".format("/".join(name)))
            continue
        pointer = model
        for m_name in name:
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                l = re.split(r'_(\d+)', m_name)
            else:
                l = [m_name]
            if l[0] == 'kernel' or l[0] == 'gamma':
                pointer = getattr(pointer, 'weight')
            elif l[0] == 'output_bias' or l[0] == 'beta':
                pointer = getattr(pointer, 'bias')
            elif l[0] == 'output_weights':
                pointer = getattr(pointer, 'weight')
            else:
                pointer = getattr(pointer, l[0])
            if len(l) >= 2:
                num = int(l[1])
                pointer = pointer[num]
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        print("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument("--tf_checkpoint_path",
                        default = None,
                        type = str,
                        required = True,
                        help = "Path to the TensorFlow checkpoint.")
    parser.add_argument("--bert_config_file",
                        default = None,
                        type = str,
                        required = True,
                        help = "The config json file corresponding to the pre-trained BERT model. \n"
                               "This specifies the model architecture.")
    parser.add_argument("--pytorch_dump_path",
                        default = None,
                        type = str,
                        required = True,
                        help = "Path to the output PyTorch model.")
    args = parser.parse_args()
    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
                                     args.bert_config_file,
                                     args.pytorch_dump_path)
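The shard-reassembly logic inside load_openai_pretrained_model can be hard to follow from the slicing alone. Here is a small illustrative sketch of the same np.cumsum / np.split pattern; the shapes and the flat array are made-up placeholders, not the real checkpoint contents:

import numpy as np

# Toy shapes standing in for the contents of params_shapes.json (illustrative only).
shapes = [[3, 2], [4], [2, 2]]
# Flat end offset of each parameter inside the concatenated shards.
offsets = np.cumsum([np.prod(shape) for shape in shapes])
# Pretend this flat array is np.concatenate([np.load('params_0.npy'), ...], 0).
flat = np.arange(offsets[-1], dtype=np.float32)
# Cut at the offsets, drop the trailing empty chunk, then restore each parameter's shape.
params = np.split(flat, offsets)[:-1]
params = [param.reshape(shape) for param, shape in zip(params, shapes)]
print([p.shape for p in params])   # [(3, 2), (4,), (2, 2)]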
pytorch_pretrained_bert/modeling.py (modified)
@@ -416,12 +416,12 @@ class BertPreTrainingHeads(nn.Module):
         return prediction_scores, seq_relationship_score


-class PreTrainedBertModel(nn.Module):
+class PreTrainedModel(nn.Module):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
     def __init__(self, config, *inputs, **kwargs):
-        super(PreTrainedBertModel, self).__init__()
+        super(PreTrainedModel, self).__init__()
         if not isinstance(config, BertConfig):
             raise ValueError(
                 "Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
@@ -447,7 +447,7 @@ class PreTrainedBertModel(nn.Module):
     @classmethod
     def from_pretrained(cls, pretrained_model_name, state_dict=None, cache_dir=None, *inputs, **kwargs):
         """
-        Instantiate a PreTrainedBertModel from a pre-trained model file or a pytorch state dict.
+        Instantiate a PreTrainedModel from a pre-trained model file or a pytorch state dict.
         Download and cache the pre-trained model file if needed.

         Params:
@@ -551,7 +551,7 @@ class PreTrainedBertModel(nn.Module):
         return model


-class BertModel(PreTrainedBertModel):
+class BertModel(PreTrainedModel):
     """BERT model ("Bidirectional Embedding Representations from a Transformer").

     Params:
@@ -634,7 +634,7 @@ class BertModel(PreTrainedBertModel):
         return encoded_layers, pooled_output


-class BertForPreTraining(PreTrainedBertModel):
+class BertForPreTraining(PreTrainedModel):
     """BERT model with pre-training heads.
     This module comprises the BERT model followed by the two pre-training heads:
         - the masked language modeling head, and
@@ -705,7 +705,7 @@ class BertForPreTraining(PreTrainedBertModel):
         return prediction_scores, seq_relationship_score


-class BertForMaskedLM(PreTrainedBertModel):
+class BertForMaskedLM(PreTrainedModel):
     """BERT model with the masked language modeling head.
     This module comprises the BERT model followed by the masked language modeling head.

@@ -766,7 +766,7 @@ class BertForMaskedLM(PreTrainedBertModel):
         return prediction_scores


-class BertForNextSentencePrediction(PreTrainedBertModel):
+class BertForNextSentencePrediction(PreTrainedModel):
     """BERT model with next sentence prediction head.
     This module comprises the BERT model followed by the next sentence classification head.

@@ -828,7 +828,7 @@ class BertForNextSentencePrediction(PreTrainedBertModel):
         return seq_relationship_score


-class BertForSequenceClassification(PreTrainedBertModel):
+class BertForSequenceClassification(PreTrainedModel):
     """BERT model for classification.
     This module is composed of the BERT model with a linear layer on top of
     the pooled output.
@@ -894,7 +894,7 @@ class BertForSequenceClassification(PreTrainedBertModel):
         return logits


-class BertForMultipleChoice(PreTrainedBertModel):
+class BertForMultipleChoice(PreTrainedModel):
     """BERT model for multiple choice tasks.
     This module is composed of the BERT model with a linear layer on top of
     the pooled output.
@@ -963,7 +963,7 @@ class BertForMultipleChoice(PreTrainedBertModel):
         return reshaped_logits


-class BertForTokenClassification(PreTrainedBertModel):
+class BertForTokenClassification(PreTrainedModel):
     """BERT model for token-level classification.
     This module is composed of the BERT model with a linear layer on top of
     the full hidden state of the last layer.
@@ -1029,7 +1029,7 @@ class BertForTokenClassification(PreTrainedBertModel):
         return logits


-class BertForQuestionAnswering(PreTrainedBertModel):
+class BertForQuestionAnswering(PreTrainedModel):
     """BERT model for Question Answering (span extraction).
     This module is composed of the BERT model with a linear layer on top of
     the sequence output that computes start_logits and end_logits
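The hunks above only rename the abstract base class PreTrainedBertModel to PreTrainedModel; its from_pretrained interface is unchanged. A hypothetical smoke test of the rename (not part of the commit; from_pretrained downloads and caches the weights on first use) could look like:

from pytorch_pretrained_bert.modeling import BertModel, PreTrainedModel

# After this commit, BertModel derives from the renamed base class...
assert issubclass(BertModel, PreTrainedModel)

# ...and is still loaded through the same classmethod inherited from it.
model = BertModel.from_pretrained('bert-base-uncased')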
pytorch_pretrained_bert/modeling_openai.py (new file, mode 100644)
import copy
import json
import math
import re
import collections

import numpy as np
import torch
import torch.nn as nn
from torch.nn.parameter import Parameter

from .modeling import BertLayerNorm as LayerNorm


def gelu(x):
    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))


def swish(x):
    return x * torch.sigmoid(x)


ACT_FNS = {
    'relu': nn.ReLU,
    'swish': swish,
    'gelu': gelu
}


class Conv1D(nn.Module):
    def __init__(self, nf, rf, nx):
        super(Conv1D, self).__init__()
        self.rf = rf
        self.nf = nf
        if rf == 1:  # faster 1x1 conv
            w = torch.empty(nx, nf)
            nn.init.normal_(w, std=0.02)
            self.w = Parameter(w)
            self.b = Parameter(torch.zeros(nf))
        else:  # was used to train LM
            raise NotImplementedError

    def forward(self, x):
        if self.rf == 1:
            size_out = x.size()[:-1] + (self.nf,)
            x = torch.addmm(self.b, x.view(-1, x.size(-1)), self.w)
            x = x.view(*size_out)
        else:
            raise NotImplementedError
        return x


class Attention(nn.Module):
    def __init__(self, nx, n_ctx, cfg, scale=False):
        super(Attention, self).__init__()
        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
        assert n_state % cfg.n_head == 0
        self.register_buffer('b', torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
        self.n_head = cfg.n_head
        self.split_size = n_state
        self.scale = scale
        self.c_attn = Conv1D(n_state * 3, 1, nx)
        self.c_proj = Conv1D(n_state, 1, nx)
        self.attn_dropout = nn.Dropout(cfg.attn_pdrop)
        self.resid_dropout = nn.Dropout(cfg.resid_pdrop)

    def _attn(self, q, k, v):
        w = torch.matmul(q, k)
        if self.scale:
            w = w / math.sqrt(v.size(-1))
        w = w * self.b + -1e9 * (1 - self.b)  # TF implem method: mask_attn_weights
        w = nn.Softmax(dim=-1)(w)
        w = self.attn_dropout(w)
        return torch.matmul(w, v)

    def merge_heads(self, x):
        x = x.permute(0, 2, 1, 3).contiguous()
        new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
        return x.view(*new_x_shape)  # in Tensorflow implem: fct merge_states

    def split_heads(self, x, k=False):
        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
        x = x.view(*new_x_shape)  # in Tensorflow implem: fct split_states
        if k:
            return x.permute(0, 2, 3, 1)
        else:
            return x.permute(0, 2, 1, 3)

    def forward(self, x):
        x = self.c_attn(x)
        query, key, value = x.split(self.split_size, dim=2)
        query = self.split_heads(query)
        key = self.split_heads(key, k=True)
        value = self.split_heads(value)
        a = self._attn(query, key, value)
        a = self.merge_heads(a)
        a = self.c_proj(a)
        a = self.resid_dropout(a)
        return a


class MLP(nn.Module):
    def __init__(self, n_state, cfg):  # in MLP: n_state=3072 (4 * n_embd)
        super(MLP, self).__init__()
        nx = cfg.n_embd
        self.c_fc = Conv1D(n_state, 1, nx)
        self.c_proj = Conv1D(nx, 1, n_state)
        self.act = ACT_FNS[cfg.afn]
        self.dropout = nn.Dropout(cfg.resid_pdrop)

    def forward(self, x):
        h = self.act(self.c_fc(x))
        h2 = self.c_proj(h)
        return self.dropout(h2)


class Block(nn.Module):
    def __init__(self, n_ctx, cfg, scale=False):
        super(Block, self).__init__()
        nx = cfg.n_embd
        self.attn = Attention(nx, n_ctx, cfg, scale)
        self.ln_1 = LayerNorm(nx)
        self.mlp = MLP(4 * nx, cfg)
        self.ln_2 = LayerNorm(nx)

    def forward(self, x):
        a = self.attn(x)
        n = self.ln_1(x + a)
        m = self.mlp(n)
        h = self.ln_2(n + m)
        return h


class TransformerModel(nn.Module):
    """ Transformer model """

    def __init__(self, cfg, vocab=40990, n_ctx=512):
        super(TransformerModel, self).__init__()
        self.vocab = vocab
        self.embed = nn.Embedding(vocab, cfg.n_embd)
        self.drop = nn.Dropout(cfg.embd_pdrop)
        block = Block(n_ctx, cfg, scale=True)
        self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(cfg.n_layer)])

        nn.init.normal_(self.embed.weight, std=0.02)

    def forward(self, x):
        x = x.view(-1, x.size(-2), x.size(-1))
        e = self.embed(x)
        # Add the position information to the input embeddings
        h = e.sum(dim=2)
        for block in self.h:
            h = block(h)
        return h


class LMHead(nn.Module):
    """ Language Model Head for the transformer """

    def __init__(self, model, cfg):
        super(LMHead, self).__init__()
        self.n_embd = cfg.n_embd
        embed_shape = model.embed.weight.shape
        self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
        self.decoder.weight = model.embed.weight  # Tied weights

    def forward(self, h):
        # Truncated Language modeling logits (we remove the last token)
        h_trunc = h[:, :-1].contiguous().view(-1, self.n_embd)
        lm_logits = self.decoder(h_trunc)
        return lm_logits


class MultipleChoiceHead(nn.Module):
    """ Classifier Head for the transformer """

    def __init__(self, clf_token, cfg):
        super(MultipleChoiceHead, self).__init__()
        self.n_embd = cfg.n_embd
        self.clf_token = clf_token
        self.dropout = nn.Dropout2d(cfg.clf_pdrop)  # To reproduce the noise_shape parameter of TF implementation
        self.linear = nn.Linear(cfg.n_embd, 1)

        nn.init.normal_(self.linear.weight, std=0.02)
        nn.init.normal_(self.linear.bias, 0)

    def forward(self, h, x):
        # Classification logits
        clf_h = h.view(-1, self.n_embd)
        flat = x[..., 0].contiguous().view(-1)
        clf_h = clf_h[flat == self.clf_token, :]
        clf_h = clf_h.view(-1, x.size(1), self.n_embd, 1)
        # This double transposition is there to replicate the behavior
        # of the noise_shape argument in the tensorflow
        # implementation.  For more details, see
        # https://github.com/huggingface/pytorch-openai-transformer-lm/issues/11
        clf_h = self.dropout(clf_h.transpose(1, 2)).transpose(1, 2)
        clf_h = clf_h.contiguous().view(-1, self.n_embd)
        clf_logits = self.linear(clf_h)

        return clf_logits.view(-1, x.size(1))


class ClfHead(nn.Module):
    """Classification Head for the transformer

    TODO: test this class."""

    def __init__(self, clf_token, cfg, n_class):
        super(ClfHead, self).__init__()
        self.n_embd = cfg.n_embd
        self.clf_token = clf_token
        self.dropout = nn.Dropout(cfg.clf_pdrop)
        self.linear = nn.Linear(cfg.n_embd, n_class)

        nn.init.normal_(self.linear.weight, std=0.02)
        nn.init.normal_(self.linear.bias, 0)

    def forward(self, h, x):
        clf_h = h.view(-1, self.n_embd)
        flat = x[..., 0].contiguous().view(-1)
        clf_h = clf_h[flat == self.clf_token, :]
        clf_h = self.dropout(clf_h)
        clf_logits = self.linear(clf_h)

        return clf_logits


class SimilarityHead(nn.Module):
    """ Similarity Head for the transformer

        TODO: test this class."""

    def __init__(self, clf_token, cfg):
        super(SimilarityHead, self).__init__()
        self.n_embd = cfg.n_embd
        self.clf_token = clf_token
        self.dropout = nn.Dropout(cfg.clf_pdrop)
        self.linear = nn.Linear(cfg.n_embd, 1)

        nn.init.normal_(self.linear.weight, std=0.02)
        nn.init.normal_(self.linear.bias, 0)

    def forward(self, h, x):
        sim_h = h.view(-1, self.n_embd)
        flat = x[..., 0].contiguous().view(-1)
        sim_h = sim_h[flat == self.clf_token, :]
        sim_h = self.dropout(sim_h)
        sim_h = sim_h.sum(dim=1)
        sim_logits = self.linear(sim_h)

        return sim_logits


class DoubleHeadModel(nn.Module):
    """ Transformer with language model and task specific heads """

    def __init__(self, cfg, clf_token, task_head_type, vocab=40990, n_ctx=512):
        super(DoubleHeadModel, self).__init__()
        self.transformer = TransformerModel(cfg, vocab=vocab, n_ctx=n_ctx)
        self.lm_head = LMHead(self.transformer, cfg)
        if isinstance(task_head_type, str):
            if task_head_type == 'multiple_choice':
                self.task_head = MultipleChoiceHead(clf_token, cfg)
            elif task_head_type == 'similarity':
                self.task_head = SimilarityHead(clf_token, cfg)
            elif task_head_type == 'inference':
                # the three classes correspond to entailment, contradiction and neutral.
                self.task_head = ClfHead(clf_token, cfg, 3)
            else:
                raise ValueError("task_head_type is expected to be 'multiple_choice' "
                                 "'similarity', 'inference' or ('classification', n_class) "
                                 f"got {task_head_type}.")
        elif isinstance(task_head_type, collections.abc.Sequence) and len(task_head_type) == 2 and \
                task_head_type[0] == 'classification':
            n_class = task_head_type[1]
            self.task_head = ClfHead(clf_token, cfg, n_class)
        else:
            raise ValueError("task_head_type is expected to be 'multiple_choice' "
                             "'similarity', 'inference' or ('classification', n_class) "
                             f"got {task_head_type}.")

    def forward(self, x):
        h = self.transformer(x)
        lm_logits = self.lm_head(h)
        task_logits = self.task_head(h, x)

        return lm_logits, task_logits


class dotdict(dict):
    """dot.notation access to dictionary attributes"""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__


DEFAULT_CONFIG = dotdict({
    'n_embd': 768,
    'n_head': 12,
    'n_layer': 12,
    'embd_pdrop': 0.1,
    'attn_pdrop': 0.1,
    'resid_pdrop': 0.1,
    'afn': 'gelu',
    'clf_pdrop': 0.1})
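The input layout expected by TransformerModel is easy to miss: the last dimension of size 2 carries token ids and position ids, both are looked up in the same embedding matrix, and the two embeddings are summed. Below is a minimal, hypothetical forward-pass sketch; the sizes (n_ctx of 77, batch of 2) and the vocabulary arithmetic are illustrative assumptions, not values fixed by this commit:

import torch
from pytorch_pretrained_bert.modeling_openai import DEFAULT_CONFIG, TransformerModel, LMHead

n_vocab = 40478                  # BPE vocabulary size of the released OpenAI GPT weights
n_ctx = 77                       # illustrative context length
total_vocab = n_vocab + n_ctx    # positions are embedded as extra vocabulary entries at the end

cfg = DEFAULT_CONFIG
model = TransformerModel(cfg, vocab=total_vocab, n_ctx=n_ctx)
lm_head = LMHead(model, cfg)

# Input of shape (batch, seq_len, 2): channel 0 holds token ids, channel 1 holds
# position ids; TransformerModel.forward embeds both and sums them (e.sum(dim=2)).
batch_size = 2
token_ids = torch.randint(0, n_vocab, (batch_size, n_ctx))
position_ids = torch.arange(n_vocab, total_vocab).expand(batch_size, n_ctx)
x = torch.stack([token_ids, position_ids], dim=-1)

h = model(x)             # hidden states: (batch, n_ctx, cfg.n_embd)
lm_logits = lm_head(h)   # logits for all but the last position: (batch * (n_ctx - 1), total_vocab)
print(h.shape, lm_logits.shape)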
pytorch_pretrained_bert/optimization_openai.py (new file, mode 100644)
import math
import torch
from torch.optim import Optimizer
from torch.nn.utils import clip_grad_norm_


def warmup_cosine(x, warmup=0.002):
    s = 1 if x <= warmup else 0
    return s * (x / warmup) + (1 - s) * (0.5 * (1 + torch.cos(math.pi * x)))

def warmup_constant(x, warmup=0.002):
    s = 1 if x <= warmup else 0
    return s * (x / warmup) + (1 - s) * 1

def warmup_linear(x, warmup=0.002):
    s = 1 if x <= warmup else 0
    return (s * (x / warmup) + (1 - s)) * (1 - x)

SCHEDULES = {
    'warmup_cosine': warmup_cosine,
    'warmup_constant': warmup_constant,
    'warmup_linear': warmup_linear,
}


class OpenAIAdam(Optimizer):
    """Implements Open AI version of Adam algorithm with weight decay fix.
    """
    def __init__(self, params, lr, schedule, warmup, t_total,
                 b1=0.9, b2=0.999, e=1e-8, l2=0,
                 vector_l2=False, max_grad_norm=-1, **kwargs):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if schedule not in SCHEDULES:
            raise ValueError("Invalid schedule parameter: {}".format(schedule))
        if not 0 <= warmup:
            raise ValueError("Invalid warmup: {}".format(warmup))
        if not 0.0 <= b1 < 1.0:
            raise ValueError("Invalid b1 parameter: {}".format(b1))
        if not 0.0 <= b2 < 1.0:
            raise ValueError("Invalid b2 parameter: {}".format(b2))
        if not 0.0 <= e:
            raise ValueError("Invalid epsilon value: {}".format(e))
        defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
                        b1=b1, b2=b2, e=e, l2=l2, vector_l2=vector_l2,
                        max_grad_norm=max_grad_norm)
        super(OpenAIAdam, self).__init__(params, defaults)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['b1'], group['b2']

                state['step'] += 1

                # Add grad clipping
                if group['max_grad_norm'] > 0:
                    clip_grad_norm_(p, group['max_grad_norm'])

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(1 - beta1, grad)
                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
                denom = exp_avg_sq.sqrt().add_(group['e'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                schedule_fct = SCHEDULES[group['schedule']]
                lr_scheduled = group['lr'] * schedule_fct(state['step'] / group['t_total'], group['warmup'])
                step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1

                p.data.addcdiv_(-step_size, exp_avg, denom)

                # Add weight decay at the end (fixed version)
                if (len(p.size()) > 1 or group['vector_l2']) and group['l2'] > 0:
                    p.data.add_(-lr_scheduled * group['l2'], p.data)

        return loss
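For context, here is a minimal, hypothetical training-step sketch with OpenAIAdam. The toy linear model and hyper-parameter values are illustrative only, and the in-place add_/addcdiv_ signatures in the optimizer assume a PyTorch version contemporary with this commit:

import torch
import torch.nn.functional as F
from pytorch_pretrained_bert.optimization_openai import OpenAIAdam

model = torch.nn.Linear(10, 2)                 # toy model, illustrative only
optimizer = OpenAIAdam(model.parameters(),
                       lr=6.25e-5,             # illustrative learning rate
                       schedule='warmup_linear',
                       warmup=0.002,
                       t_total=1000,           # total number of optimization steps
                       b1=0.9, b2=0.999, e=1e-8,
                       l2=0.01,                # "fixed" weight decay applied after the Adam update
                       max_grad_norm=1)

inputs = torch.randn(4, 10)
targets = torch.randint(0, 2, (4,))
loss = F.cross_entropy(model(inputs), targets)
loss.backward()
optimizer.step()                               # applies warmup-scaled lr, clipping and weight decay
optimizer.zero_grad()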
pytorch_pretrained_bert/tokenization_openai.py (new file, mode 100644)
import re
import ftfy
import json
import spacy

from tqdm import tqdm


def get_pairs(word):
    """
    Return set of symbol pairs in a word.
    word is represented as tuple of symbols (symbols being variable-length strings)
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs

def text_standardize(text):
    """
    fixes some issues the spacy tokenizer had on books corpus
    also does some whitespace standardization
    """
    text = text.replace('—', '-')
    text = text.replace('–', '-')
    text = text.replace('―', '-')
    text = text.replace('…', '...')
    text = text.replace('´', "'")
    text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
    text = re.sub(r'\s*\n\s*', ' \n ', text)
    text = re.sub(r'[^\S\n]+', ' ', text)
    return text.strip()

class TextEncoder(object):
    """
    mostly a wrapper for a public python bpe tokenizer
    """

    def __init__(self, encoder_path, bpe_path):
        self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
        self.encoder = json.load(open(encoder_path))
        self.decoder = {v: k for k, v in self.encoder.items()}
        merges = open(bpe_path, encoding='utf-8').read().split('\n')[1:-1]
        merges = [tuple(merge.split()) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}

    def bpe(self, token):
        word = tuple(token[:-1]) + (token[-1] + '</w>',)
        if token in self.cache:
            return self.cache[token]
        pairs = get_pairs(word)

        if not pairs:
            return token + '</w>'

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except:
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        if word == '\n  </w>':
            word = '\n</w>'
        self.cache[token] = word
        return word

    def encode(self, texts, verbose=True):
        texts_tokens = []
        if verbose:
            for text in tqdm(texts, ncols=80, leave=False):
                text = self.nlp(text_standardize(ftfy.fix_text(text)))
                text_tokens = []
                for token in text:
                    text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
                texts_tokens.append(text_tokens)
        else:
            for text in texts:
                text = self.nlp(text_standardize(ftfy.fix_text(text)))
                text_tokens = []
                for token in text:
                    text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
                texts_tokens.append(text_tokens)
        return texts_tokens
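A hypothetical usage sketch for TextEncoder follows. The file paths are placeholders for the BPE vocabulary and merges files of the original OpenAI GPT release (treat the names as assumptions), and spacy's English model must be installed for spacy.load('en') to succeed:

from pytorch_pretrained_bert.tokenization_openai import TextEncoder

# Placeholder paths: point these at the encoder/merges files from the OpenAI GPT release.
encoder = TextEncoder(encoder_path='model/encoder_bpe_40000.json',
                      bpe_path='model/vocab_40000.bpe')

ids = encoder.encode(["Adding OpenAI GPT to PyTorch."], verbose=False)
print(ids[0])                                  # BPE token ids for the sentence
print([encoder.decoder[i] for i in ids[0]])    # map the ids back to their BPE strings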