chenpangpang / transformers / Commits

Commit ccce66be · authored Oct 30, 2018 by thomwolf

getting ready

parent 43badf21

Showing 5 changed files with 595 additions and 104 deletions (+595 -104):

    .gitignore            +2    -104
    bert_model.py         +481  -0
    data_processor.py     +89   -0
    download_weights.sh   +5    -0
    example.py            +18   -0
.gitignore · view file @ ccce66be
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
# VSCode
.vscode
\ No newline at end of file
bert_model.py · 0 → 100644 · view file @ ccce66be
"""
A PyTorch implementation of Google's BERT Model.
From "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding"
By Jacob Devlin, Ming-Wei Chang, Kenton Lee, Kristina Toutanova
Link: http://arxiv.org/abs/1810.04805
Adapted from HuggingFace's OpenAI PyTorch code and its adaptation by AllenNLP.
"""
# pylint: disable=invalid-name,arguments-differ
from
typing
import
NamedTuple
,
List
import
copy
import
io
import
json
import
logging
import
math
import
pathlib
import
re
import
tarfile
import
numpy
as
np
import
torch
from
torch.nn
import
Parameter
# TensorFlow checkpoint parameter names: the token embeddings ("we"), then twelve
# parameters for each of the twelve layers (h0..h11), then the classifier weights.
_PARAMETER_NAMES = (
    ["model/we:0"]
    + [f"model/h{layer}/{param}:0"
       for layer in range(12)
       for param in ("attn/c_attn/w", "attn/c_attn/b",
                     "attn/c_proj/w", "attn/c_proj/b",
                     "ln_1/g", "ln_1/b",
                     "mlp/c_fc/w", "mlp/c_fc/b",
                     "mlp/c_proj/w", "mlp/c_proj/b",
                     "ln_2/g", "ln_2/b")]
    + ["model/clf/w:0", "model/clf/b:0"])
def gelu(x: torch.Tensor) -> torch.Tensor:
    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
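
# Illustrative sketch (not in the committed file): the tanh expression above is the
# Hendrycks & Gimpel approximation of the exact GELU, x * Phi(x); over typical
# activation ranges the two agree closely.
if __name__ == "__main__":
    _t = torch.linspace(-3.0, 3.0, steps=13)
    _exact = _t * 0.5 * (1.0 + torch.erf(_t / math.sqrt(2.0)))
    print((gelu(_t) - _exact).abs().max().item())  # small, on the order of 1e-3 or less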
class BERTConfig(NamedTuple):
    """
    BERT's hyper-parameters
    """
    embedding_dim: int = 768
    num_heads: int = 12
    dropout: float = 0.1
class LayerNorm(torch.nn.Module):
    """
    A layernorm module in the TensorFlow style (with the epsilon inside the square root).
    """
    def __init__(self, n_state, e=1e-5):
        super().__init__()
        self.g = torch.nn.Parameter(torch.ones(n_state))
        self.b = torch.nn.Parameter(torch.zeros(n_state))
        self.e = e

    def forward(self, x):
        u = x.mean(-1, keepdim=True)
        s = (x - u).pow(2).mean(-1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.e)
        return self.g * x + self.b
class Conv1D(torch.nn.Module):
    """
    A batched linear layer using torch.addmm
    """
    def __init__(self, nf: int, rf: int, nx: int) -> None:
        super().__init__()
        self.rf = rf
        self.nf = nf
        w = torch.empty(nx, nf)
        torch.nn.init.normal_(w, std=0.02)
        self.w = Parameter(w)
        self.b = Parameter(torch.zeros(nf))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        size_out = x.size()[:-1] + (self.nf,)
        x = torch.addmm(self.b, x.view(-1, x.size(-1)), self.w)
        x = x.view(*size_out)
        return x
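
# Illustrative sketch (not in the committed file): with rf=1, Conv1D is simply a
# position-wise linear layer; the addmm in forward matches a plain matmul plus bias.
if __name__ == "__main__":
    _conv = Conv1D(nf=3072, rf=1, nx=768)
    _x = torch.randn(2, 5, 768)
    print(torch.allclose(_conv(_x), _x @ _conv.w + _conv.b, atol=1e-5))  # True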
class Attention(torch.nn.Module):
    """
    A self-attention layer comprising a sequence of:
        - a linear layer: instance of the `Conv1D` class,
        - splitting the inputs into key, value, query tensors (x.split),
        - reshaping key, value, query tensors according to the number of heads (self.split_heads),
        - applying self-attention (self._attn),
        - merging back the heads' results (self.merge_heads),
        - a linear layer: instance of the `Conv1D` class,
        - a dropout layer: instance of the `torch.nn.Dropout` class.
    See above for the details of Conv1D.
    """
    def __init__(self, nx: int, n_ctx: int, config: BERTConfig, scale: bool = False) -> None:
        super().__init__()
        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
        assert n_state % config.num_heads == 0
        self.register_buffer('b', torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
        self.n_head = config.num_heads
        self.split_size = n_state
        self.scale = scale
        self.c_attn = Conv1D(n_state * 3, 1, nx)
        self.c_proj = Conv1D(n_state, 1, nx)
        self.attn_dropout = torch.nn.Dropout(config.dropout)
        self.resid_dropout = torch.nn.Dropout(config.dropout)

    def _attn(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
        w = torch.matmul(q, k)
        if self.scale:
            w = w / math.sqrt(v.size(-1))
        w = w * self.b + -1e9 * (1 - self.b)  # TF implem method: mask_attn_weights
        w = torch.nn.Softmax(dim=-1)(w)
        w = self.attn_dropout(w)
        return torch.matmul(w, v)

    def merge_heads(self, x: torch.Tensor):
        # pylint: disable=no-self-use
        x = x.permute(0, 2, 1, 3).contiguous()
        new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
        return x.view(*new_x_shape)  # in Tensorflow implem: fct merge_states

    def split_heads(self, x: torch.Tensor, k: bool = False):
        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
        x = x.view(*new_x_shape)  # in Tensorflow implem: fct split_states
        if k:
            return x.permute(0, 2, 3, 1)
        else:
            return x.permute(0, 2, 1, 3)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.c_attn(x)
        query, key, value = x.split(self.split_size, dim=2)
        query = self.split_heads(query)
        key = self.split_heads(key, k=True)
        value = self.split_heads(value)
        a = self._attn(query, key, value)
        a = self.merge_heads(a)
        a = self.c_proj(a)
        a = self.resid_dropout(a)
        return a
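
# Illustrative sketch (not in the committed file): a shape walk-through of the attention
# block with the default BERTConfig. Note that the tril buffer registered in __init__ is a
# causal mask carried over from the OpenAI GPT code, and, as written, it requires the
# sequence length to equal n_ctx.
if __name__ == "__main__":
    _attn = Attention(nx=768, n_ctx=5, config=BERTConfig(), scale=True)
    _x = torch.randn(2, 5, 768)  # (batch, sequence, embedding)
    # c_attn -> (2, 5, 2304), split into query/key/value of (2, 5, 768) each,
    # reshaped to (2, 12, 5, 64) per head, attended, then merged back to (2, 5, 768)
    print(_attn(_x).shape)       # torch.Size([2, 5, 768])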
class MLP(torch.nn.Module):
    """
    A multi-layer perceptron layer comprising a sequence of:
        - a linear layer: instance of the `Conv1D` class,
        - an activation function: the `gelu` function,
        - another linear layer: instance of the `Conv1D` class,
        - a dropout layer: instance of the `torch.nn.Dropout` class.
    See above for the details of Conv1D and the gelu function.
    """
    def __init__(self, n_state: int, config: BERTConfig) -> None:
        # in MLP: n_state=3072 (4 * n_embd)
        super().__init__()
        self.c_fc = Conv1D(n_state, 1, config.embedding_dim)
        self.c_proj = Conv1D(config.embedding_dim, 1, n_state)
        self.act = gelu
        self.dropout = torch.nn.Dropout(config.dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        h = self.act(self.c_fc(x))
        h2 = self.c_proj(h)
        return self.dropout(h2)
class Block(torch.nn.Module):
    """
    A Transformer Block comprising a sequence of:
        - a self-attention layer: instance of the `Attention` class,
        - a Layer Normalization layer: instance of the `LayerNorm` class,
        - a Multi-layer perceptron layer: instance of the `MLP` class,
        - another Layer Normalization layer: instance of the `LayerNorm` class.
    See above for the details of these classes.
    """
    def __init__(self, n_ctx: int, config: BERTConfig, scale: bool = False) -> None:
        super().__init__()
        nx = config.embedding_dim
        self.attn = Attention(nx, n_ctx, config, scale)
        self.ln_1 = LayerNorm(nx)
        self.mlp = MLP(4 * nx, config)
        self.ln_2 = LayerNorm(nx)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        a = self.attn(x)
        n = self.ln_1(x + a)
        m = self.mlp(n)
        h = self.ln_2(n + m)
        return h
class BERT(torch.nn.Module):
    """
    Google's BERT Model.
    Default parameters are the ones for Google's pretrained model.

    Parameters
    ----------
    vocab_size: ``int`` (optional, default: 40478)
        The size of the vocabulary (number of byte pair embeddings)
        excluding the n_special embeddings (if any), and the positional embeddings.
    n_ctx: ``int`` (optional, default: 512)
        The number of positional encodings to use for evaluation.
    embedding_dim: ``int`` (optional, default: 768)
        The dimension of the output embeddings.
    num_heads: ``int`` (optional, default: 12)
        How many "heads" the attention has.
    num_layers: ``int`` (optional, default: 12)
        How many layers of "blocks" the transformer has.
    dropout_probability: ``float`` (optional, default: 0.1)
        Dropout for all layers.
    model_path: ``str`` (optional, default: ``None``)
        A tar.gz file containing serialized model weights. If supplied,
        the weights will be loaded from that file.
    requires_grad: ``bool`` (optional, default: ``False``)
        If true, the transformer will be fine-tunable.
    n_special: ``int`` (optional, default: ``-1``)
        The number of special tokens added to the byte pair vocabulary.
    """
    def __init__(self,
                 vocab_size: int = 40478,
                 n_ctx: int = 512,
                 embedding_dim: int = 768,
                 num_heads: int = 12,
                 num_layers: int = 12,
                 dropout_probability: float = 0.1,
                 model_path: str = None,
                 requires_grad: bool = False,
                 n_special: int = -1) -> None:
        super().__init__()
        config = BERTConfig(embedding_dim, num_heads, dropout_probability)
        # the embedding size is vocab_size + n_special embeddings + n_ctx
        embedding_size = vocab_size + max(n_special, 0) + n_ctx
        self.vocab_size = embedding_size
        self.n_ctx = n_ctx
        self.n_special = n_special
        self.num_output_layers = 1 + num_layers
        self.embed = torch.nn.Embedding(embedding_size, embedding_dim)
        self.drop = torch.nn.Dropout(dropout_probability)
        block = Block(n_ctx, config, scale=True)
        self.h = torch.nn.ModuleList([copy.deepcopy(block) for _ in range(num_layers)])
        self.decoder = torch.nn.Linear(embedding_dim, embedding_size, bias=False)
        self.decoder.weight = self.embed.weight  # Tied weights
        # To reproduce the noise_shape parameter of TF implementation
        torch.nn.init.normal_(self.embed.weight, std=0.02)
        for parameter in self.parameters():
            parameter.requires_grad = requires_grad
        if model_path:
            self.load_weights(model_path, n_special=n_special, n_ctx=n_ctx)
    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
        # x = x.view(-1, x.size(2), x.size(3))
        # x is a (batch_size, sequence_length, 2) tensor of byte-pair and position ids
        # e is a (batch_size, sequence_length, 2, embedding_dim) tensor of embeddings
        e = self.embed(x)
        # h is (batch_size, sequence_length, embedding_dim): token and position embeddings summed
        h = e.sum(dim=2)
        all_layers = [h]
        for block in self.h:
            h = block(h)
            all_layers.append(h)
        # result is a list of (batch_size, sequence_length, embedding_dim) tensors
        return all_layers
    def load_weights(self,
                     bert_model_path: str,
                     n_ctx: int = -1,
                     n_special: int = -1,
                     n_transfer: int = 12,
                     n_embd: int = 768,
                     names: List[str] = _PARAMETER_NAMES) -> None:
        # pylint: disable=dangerous-default-value
        logger.info(f"loading weights from {bert_model_path}")
        # if `file_path` is a URL, redirect to the cache
        with tarfile.open(bert_model_path) as tmp:
            num_params_files = len([member for member in tmp.getmembers()
                                    if member.name.endswith('.npy')])
            shapesfile = tmp.extractfile('model/params_shapes.json')
            if shapesfile:
                shapes = json.loads(shapesfile.read())
            else:
                raise RuntimeError("unable to find model/params_shapes.json in the archive")
            # numpy can't read from a tarfile directly, so we need a workaround
            # https://github.com/numpy/numpy/issues/7989#issuecomment-341656702
            init_params: List[np.ndarray] = []
            for n in range(num_params_files):
                array_file = io.BytesIO()
                array_file.write(tmp.extractfile(f'model/params_{n}.npy').read())
                array_file.seek(0)
                # each np.load is a (11653478,) numpy array
                init_params.append(np.load(array_file))
        # init_params is a list of 10 arrays of size (11653478,)
        # shapes are [[512, 768], [40478, 768], [1, 768, 2304], [2304], ...  # 146 elts
        # products are [512 * 768, 40478 * 768, ...]
        # offsets is [512 * 768, 512 * 768 + 40478 * 768, ...]
        offsets = np.cumsum([np.prod(shape) for shape in shapes])
        # split into the 146 subarrays corresponding to shapes
        init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
        # reshape
        init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
        # truncate the positional embeddings if necessary
        if n_ctx > 0:
            # init_params[0] is (512, 768) = (max_chars, embedding_dim)
            init_params[0] = init_params[0][:n_ctx]
        # combine init_params[1] and init_params[0]
        if n_special > 0:
            # init_params[1] is (40478, 768)
            # special is (n_special, 768)
            # init_params[0] is (512, 768)
            # result is (40990 + n_special, 768)
            init_params[0] = np.concatenate(
                [init_params[1],
                 (np.random.randn(n_special, n_embd) * 0.02).astype(np.float32),
                 init_params[0]],
                0)
        else:
            # result is (40990, 768)
            init_params[0] = np.concatenate([init_params[1], init_params[0]], 0)
        del init_params[1]
        # number of parameters to transfer: 12 per layer, plus one extra for the embeddings
        if n_transfer == -1:
            n_transfer = 0
        else:
            n_transfer = 1 + n_transfer * 12
        # squeeze out the leading singleton dimensions, e.g. (1, 768, 2304) -> (768, 2304)
        init_params = [arr.squeeze() for arr in init_params]
        # embedding.weight is (vocab_size, embedding_dim)
        # make sure init_params[0] has the same shape
        try:
            assert self.embed.weight.shape == init_params[0].shape
        except AssertionError as e:
            e.args += (self.embed.weight.shape, init_params[0].shape)
            raise
        # and then assign it
        self.embed.weight.data = torch.from_numpy(init_params[0])
        self.decoder.weight = self.embed.weight
        # for each (name, array) pair to transfer over
        for name, ip in zip(names[1:n_transfer], init_params[1:n_transfer]):
            # "model/h0/attn/c_attn/w:0"
            name = name[6:]                  # "h0/attn/c_attn/w:0"
            assert name[-2:] == ":0"
            name = name[:-2]                 # "h0/attn/c_attn/w"
            name_parts = name.split('/')     # ['h0', 'attn', 'c_attn', 'w']
            pointer = self
            for m_name in name_parts:
                if re.fullmatch(r'[A-Za-z]+\d+', m_name):
                    l = re.split(r'(\d+)', m_name)  # ['h', '0', '']
                else:
                    l = [m_name]                    # ['attn']
                pointer = getattr(pointer, l[0])
                if len(l) >= 2:
                    num = int(l[1])
                    pointer = pointer[num]
            try:
                assert pointer.shape == ip.shape
            except AssertionError as e:
                e.args += (pointer.shape, ip.shape)
                raise
            pointer.data = torch.from_numpy(ip)
    # pylint: disable=attribute-defined-outside-init
    def dump_weights(self, output_dir: str, num_pieces: int = 10) -> None:
        output_path = pathlib.Path(output_dir) / 'model'
        output_path.mkdir(exist_ok=True, parents=True)
        # pylint: disable=no-member
        named_parameters = list(self.named_parameters())
        # embedding weights get special treatment
        _, array = named_parameters[0]
        num_bpe = self.vocab_size - self.n_ctx
        byte_pair_embeddings = array[:num_bpe]
        positional_embeddings = array[num_bpe:]
        arrays = [positional_embeddings.numpy().ravel(),
                  byte_pair_embeddings.numpy().ravel()]
        shapes = [positional_embeddings.shape, byte_pair_embeddings.shape]
        names = ["model/we:0"]
        for param_name, tensor in named_parameters[1:]:
            param_name = param_name.replace('h.', 'h', 1)  # 'h.0.attn.c_attn.w' -> 'h0.attn.c_attn.w'
            parts = param_name.split(".")                  # ['h0', 'attn', 'c_attn', 'w']
            name = "model/" + '/'.join(parts) + ':0'       # 'model/h0/attn/c_attn/w:0'
            array = tensor.numpy().ravel()
            arrays.append(array)
            shapes.append(list(tensor.shape))
            names.append(name)
        # write out the arrays in num_pieces roughly equal chunks
        big_array = np.concatenate(arrays)
        total_size = len(big_array)
        batch_size = math.ceil(total_size / num_pieces)
        for i in range(num_pieces):
            filename = output_path / f"params_{i}.npy"
            start = i * batch_size
            end = start + batch_size
            subarray = big_array[start:end]
            np.save(filename, subarray)
        # write out the shapes
        with open(output_path / 'params_shapes.json', 'w') as shapes_file:
            json.dump(shapes, shapes_file)
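
For orientation, here is a minimal usage sketch of the module above (an editor's illustration, not part of the commit): it builds a randomly initialized BERT and runs a forward pass. The (batch, sequence, 2) input layout of token and position ids follows the embedding sum in forward, and n_ctx is set to the sequence length because the attention mask, as written, assumes the two are equal.

import torch
from bert_model import BERT  # assuming the module is importable from the working directory

seq_len = 16
model = BERT(n_ctx=seq_len)   # random init; pass model_path='...' to load a weights archive
token_ids = torch.randint(0, 40478, (2, seq_len))
# position ids live at the top of the embedding matrix, above the byte-pair vocabulary
position_ids = torch.arange(40478, 40478 + seq_len).expand(2, seq_len)
x = torch.stack([token_ids, position_ids], dim=2)   # (batch, sequence, 2)
layers = model(x)             # summed embeddings plus one tensor per block
print(len(layers), layers[-1].shape)                # 13 torch.Size([2, 16, 768])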
data_processor.py · 0 → 100644 · view file @ ccce66be
"""
Prepare input data for Google's BERT Model.
Contains some functions from tensor2tensor library: https://github.com/tensorflow/tensor2tensor
"""
from
typing
import
NamedTuple
,
List
,
Union
,
Tuple
TokenizedSentence
=
List
[
str
]
TokenizedInput
=
Union
[
Tuple
[
TokenizedSentence
,
TokenizedSentence
],
TokenizedSentence
]
class DataProcessor():
    def __init__(self, vocab_path):
        self.vocab_path = vocab_path
        with open(vocab_path) as vocab_file:
            self.token_indexer = json.load(vocab_file)
    def tokenize(self, text):
        """Encode a unicode string as a list of tokens.

        Args:
            text: a unicode string
        Returns:
            a list of tokens as Unicode strings
        """
        if not text:
            return []
        ret = []
        token_start = 0
        # Classify each character in the input string
        is_alnum = [c in _ALPHANUMERIC_CHAR_SET for c in text]
        for pos in range(1, len(text)):
            if is_alnum[pos] != is_alnum[pos - 1]:
                token = text[token_start:pos]
                if token != u" " or token_start == 0:
                    ret.append(token)
                token_start = pos
        final_token = text[token_start:]
        ret.append(final_token)
        return ret
    def detokenize(self, tokens):
        """Decode a list of tokens to a unicode string.

        Args:
            tokens: a list of Unicode strings
        Returns:
            a unicode string
        """
        token_is_alnum = [t[0] in _ALPHANUMERIC_CHAR_SET for t in tokens]
        ret = []
        for i, token in enumerate(tokens):
            if i > 0 and token_is_alnum[i - 1] and token_is_alnum[i]:
                ret.append(u" ")
            ret.append(token)
        return "".join(ret)
    def encode(self, input_sentences: List[TokenizedInput]) -> Tuple[np.ndarray, np.ndarray]:
        """Prepare a numpy array of inputs for the BERT model from tokenized sentences.

        Args:
            input_sentences: list of
                - pairs of tokenized sentences (sentence_A, sentence_B) or
                - tokenized sentences (will be considered as sentence_A only)
        Returns:
            Numpy arrays of formatted inputs and input masks for the BERT model
        """
        # NOTE: work in progress -- the body below was carried over from another codebase
        # and still references names that are not defined in this file (X1, xmb, mmb,
        # mmb_eval, x1, x2, xcand_j, j, text_encoder, dialog_embed_mode, max_len,
        # fix_lm_index, format_transformer_input, n_batch, n_cands, n_ctx, n_vocab,
        # n_special, n_perso_permute).
        batch_size = sum(min(len(x), n_perso_permute) for x in X1)
        input_mask = np.zeros((n_batch, n_cands, n_ctx), dtype=np.float32)
        input_array = np.zeros((n_batch, n_cands, n_ctx, 3), dtype=np.int32)
        i = 0
        for tokenized_input in input_sentences:
            x1j, lxj, lperso, lhisto, dialog_embed = format_transformer_input(
                x1, x2, xcand_j, text_encoder, dialog_embed_mode,
                max_len=max_len, add_start_stop=True)
            lmj = len(xcand_j[:max_len]) + 1
            xmb[i, j, :lxj, 0] = x1j
            if dialog_embed_mode == 1 or dialog_embed_mode == 2:
                xmb[i, j, :lxj, 2] = dialog_embed
            mmb[i, j, :lxj] = 1
            # mask only the response so we get the perplexity on the response only
            if fix_lm_index:
                # take one position before so we don't predict from the classify token
                mmb_eval[i, j, (lxj - lmj - 1):lxj - 1] = 1
            else:
                mmb_eval[i, j, (lxj - lmj):lxj] = 1
            xmb[i, j, :, 1] = np.arange(n_vocab + n_special, n_vocab + n_special + n_ctx)
            i += 1
        return input_array, input_mask
\ No newline at end of file
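
A quick round-trip sketch of the tokenizer above (an editor's illustration, not part of the commit; the vocabulary path is a placeholder): tokenize splits on boundaries between alphanumeric and non-alphanumeric characters and drops single spaces between words, and detokenize re-inserts them.

from data_processor import DataProcessor  # assuming the module is importable

dp = DataProcessor(vocab_path='vocab.json')   # placeholder path to a BPE vocabulary file
tokens = dp.tokenize("We are playing with the BERT model.")
print(tokens)                  # ['We', 'are', 'playing', 'with', 'the', 'BERT', 'model', '.']
print(dp.detokenize(tokens))   # 'We are playing with the BERT model.'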
download_weights.sh · 0 → 100644 · view file @ ccce66be
echo "=== Downloading BERT pre-trained weights ==="
echo "---"
# NOTE: placeholder archive -- this URL points to Mikolov's RNNLM "simple-examples"
# data, not actual BERT weights; swap in the real weights URL once it is available.
wget --quiet --continue http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
tar -xzf simple-examples.tgz
rm -rf simple-examples.tgz
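
The same step in Python for environments without wget (an editor's sketch using only the standard library; the URL is the same placeholder as in the script above):

import tarfile
import urllib.request

URL = "http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz"  # placeholder archive
filename, _ = urllib.request.urlretrieve(URL, "simple-examples.tgz")
with tarfile.open(filename) as archive:
    archive.extractall()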
example.py · 0 → 100644 · view file @ ccce66be
"""
Show how to use HuggingFace's PyTorch implementation of Google's BERT Model.
"""
from
.bert_model
import
BERT
from
.prepare_inputs
import
DataPreprocessor
bert_model
=
BERT
()
bert_model
.
load_from
(
'.'
)
data_processor
=
DataProcessor
(
encoder_file_path
=
'.'
)
input_sentence
=
"We are playing with the BERT model."
print
(
"BERT inputs: {}"
.
format
(
input_sentence
))
tensor_input
=
data_processor
.
encode
(
input_sentence
)
tensor_output
=
bert_model
(
prepared_input
)
output_sentence
=
data_processor
.
decode
(
tensor_output
)
print
(
"BERT predicted: {}"
.
format
(
output_sentence
))
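
A hedged note on what comes back: as defined in bert_model.py, the forward pass returns the full list of hidden states (the summed embeddings plus one tensor per block) rather than token predictions, so a downstream task would typically continue from the last element. A minimal sketch continuing the script above:

last_hidden = tensor_output[-1]              # (batch, sequence, embedding_dim), final block output
sentence_vector = last_hidden.mean(dim=1)    # e.g. mean-pool over tokens for a sentence vector
print("Sentence representation shape: {}".format(sentence_vector.shape))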