chenpangpang / transformers · Commits

Commit 01b68be3, authored Dec 18, 2019 by Stefan Schweter
converter: remove XLM-RoBERTa specific script (can be done with the script for RoBERTa now)
Parent: ca31abc6
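Note: per the commit message, converting an XLM-RoBERTa fairseq checkpoint is now expected to go through the generic RoBERTa converter that remains in the tree. Below is a minimal sketch of that workflow, assuming the sibling script transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py exposes the same convert_roberta_checkpoint_to_pytorch(...) entry point as the file deleted here and can load sentencepiece-based checkpoints; all paths are placeholders.

# Sketch only (not part of this commit): module path and entry point are assumed
# from the sibling RoBERTa converter; checkpoint/output paths are placeholders.
from transformers.convert_roberta_original_pytorch_checkpoint_to_pytorch import (
    convert_roberta_checkpoint_to_pytorch,
)

convert_roberta_checkpoint_to_pytorch(
    roberta_checkpoint_path="/path/to/xlmr.base",       # placeholder: fairseq checkpoint dir
    pytorch_dump_folder_path="/path/to/pytorch_model",  # placeholder: output folder
    classification_head=False,
)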
Showing 1 changed file with 0 additions and 184 deletions.

Deleted file: transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py (-184 lines, file mode 100644 → 0)
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert RoBERTa checkpoint."""

from __future__ import absolute_import, division, print_function

import argparse
import logging
import pathlib

import numpy as np
import torch
from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
from fairseq.modules import TransformerSentenceEncoderLayer

from transformers.modeling_bert import (BertConfig, BertEncoder,
                                        BertIntermediate, BertLayer,
                                        BertModel, BertOutput,
                                        BertSelfAttention, BertSelfOutput)
from transformers.modeling_roberta import (RobertaEmbeddings,
                                           RobertaForMaskedLM,
                                           RobertaForSequenceClassification,
                                           RobertaModel)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

SAMPLE_TEXT = 'Hello world! cécé herlolip'


def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_folder_path, classification_head):
    """
    Copy/paste/tweak roberta's weights to our BERT structure.
    """
    roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path, bpe='sentencepiece')
    roberta.eval()  # disable dropout
    config = BertConfig(
        vocab_size=250002,
        hidden_size=roberta.args.encoder_embed_dim,
        num_hidden_layers=roberta.args.encoder_layers,
        num_attention_heads=roberta.args.encoder_attention_heads,
        intermediate_size=roberta.args.encoder_ffn_embed_dim,
        max_position_embeddings=514,
        type_vocab_size=1,
        layer_norm_eps=1e-5,  # PyTorch default used in fairseq
    )
    if classification_head:
        config.num_labels = roberta.args.num_classes
    print("Our BERT config:", config)

    model = RobertaForSequenceClassification(config) if classification_head else RobertaForMaskedLM(config)
    model.eval()

    # Now let's copy all the weights.
    # Embeddings
    roberta_sent_encoder = roberta.model.decoder.sentence_encoder
    model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight
    model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight
    model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
        model.roberta.embeddings.token_type_embeddings.weight
    )  # just zero them out b/c RoBERTa doesn't use them.
    model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight
    model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias

    for i in range(config.num_hidden_layers):
        # Encoder: start of layer
        layer: BertLayer = model.roberta.encoder.layer[i]
        roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[i]

        ### self attention
        self_attn: BertSelfAttention = layer.attention.self
        assert (
            roberta_layer.self_attn.k_proj.weight.data.shape
            == roberta_layer.self_attn.q_proj.weight.data.shape
            == roberta_layer.self_attn.v_proj.weight.data.shape
            == torch.Size((config.hidden_size, config.hidden_size))
        )

        self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight
        self_attn.query.bias.data = roberta_layer.self_attn.q_proj.bias
        self_attn.key.weight.data = roberta_layer.self_attn.k_proj.weight
        self_attn.key.bias.data = roberta_layer.self_attn.k_proj.bias
        self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight
        self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias

        ### self-attention output
        self_output: BertSelfOutput = layer.attention.output
        assert self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape
        self_output.dense.weight = roberta_layer.self_attn.out_proj.weight
        self_output.dense.bias = roberta_layer.self_attn.out_proj.bias
        self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight
        self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias

        ### intermediate
        intermediate: BertIntermediate = layer.intermediate
        assert intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape
        intermediate.dense.weight = roberta_layer.fc1.weight
        intermediate.dense.bias = roberta_layer.fc1.bias

        ### output
        bert_output: BertOutput = layer.output
        assert bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape
        bert_output.dense.weight = roberta_layer.fc2.weight
        bert_output.dense.bias = roberta_layer.fc2.bias
        bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight
        bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias
        #### end of layer

    if classification_head:
        model.classifier.dense.weight = roberta.model.classification_heads['mnli'].dense.weight
        model.classifier.dense.bias = roberta.model.classification_heads['mnli'].dense.bias
        model.classifier.out_proj.weight = roberta.model.classification_heads['mnli'].out_proj.weight
        model.classifier.out_proj.bias = roberta.model.classification_heads['mnli'].out_proj.bias
    else:
        # LM Head
        model.lm_head.dense.weight = roberta.model.decoder.lm_head.dense.weight
        model.lm_head.dense.bias = roberta.model.decoder.lm_head.dense.bias
        model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight
        model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias
        model.lm_head.decoder.weight = roberta.model.decoder.lm_head.weight
        model.lm_head.bias = roberta.model.decoder.lm_head.bias

    # Let's check that we get the same results.
    input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0)  # batch of size 1

    our_output = model(input_ids)[0]
    if classification_head:
        their_output = roberta.model.classification_heads['mnli'](roberta.extract_features(input_ids))
    else:
        their_output = roberta.model(input_ids)[0]
    print(our_output.shape, their_output.shape)
    max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
    print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
    success = torch.allclose(our_output, their_output, atol=1e-3)
    print("Do both models output the same tensors?", "🔥" if success else "💩")
    if not success:
        raise Exception("Something went wRoNg")

    pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
    print(f"Saving model to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument("--roberta_checkpoint_path",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to the official PyTorch dump.")
    parser.add_argument("--pytorch_dump_folder_path",
                        default=None,
                        type=str,
                        required=True,
                        help="Path to the output PyTorch model.")
    parser.add_argument("--classification_head",
                        action="store_true",
                        help="Whether to convert a final classification head.")
    args = parser.parse_args()
    convert_roberta_checkpoint_to_pytorch(
        args.roberta_checkpoint_path,
        args.pytorch_dump_folder_path,
        args.classification_head,
    )
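A detail of the deleted script worth highlighting is its verification step: after porting the weights, it ran both models on the same input and bounded the largest elementwise difference before saving anything. A distilled, stand-alone sketch of that pattern follows (the helper name is ours, not from the script):

# Sketch of the parity-check pattern used above; outputs_match() is a
# hypothetical helper, not part of the deleted file.
import torch

def outputs_match(ours: torch.Tensor, theirs: torch.Tensor, atol: float = 1e-3) -> bool:
    """Report the worst elementwise deviation between two forward passes and
    check it against a tolerance, as the script above did (it typically saw ~1e-7)."""
    max_absolute_diff = torch.max(torch.abs(ours - theirs)).item()
    print(f"max_absolute_diff = {max_absolute_diff}")
    return torch.allclose(ours, theirs, atol=atol)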