chenpangpang / transformers

Commit 339e556f
authored Aug 09, 2019 by LysandreJik

CLM for BERT, beginning of CLM for RoBERTa; still needs a better masking token mechanism.

parent 5c18825a
Changes: 1 file changed, 48 additions and 14 deletions

examples/run_generative_finetuning.py  (+48, -14)
@@ -13,7 +13,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Finetuning the library models for language modeling on WikiText-2 (GPT, GPT-2, XLM)."""
+"""
+Fine-tuning the library models for language modeling on WikiText-2 (GPT, GPT-2, BERT, RoBERTa).
+GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
+using a masked language modeling (MLM) loss.
+"""
 from __future__ import absolute_import, division, print_function
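As a hedged illustration (not part of the commit) of the distinction the new docstring draws: with a CLM loss the labels are simply the input ids (the model shifts them internally so each position predicts the next token), while with an MLM loss only masked positions keep their original id as a label and all other positions are ignored with -1. The ids and shapes below are assumptions.

# Toy sketch of CLM vs. MLM label construction; ids and shapes are assumed, not taken from the script.
import torch

token_ids = torch.tensor([[101, 7592, 2088, 102]])  # a fake 4-token sequence

# Causal LM (GPT/GPT-2 path): labels == inputs; position t is trained to predict token t+1.
clm_inputs, clm_labels = token_ids, token_ids

# Masked LM (BERT/RoBERTa path): labels are -1 (ignored) except at masked positions,
# which keep the original id; the matching input positions are corrupted instead.
mlm_labels = torch.full_like(token_ids, -1)
mlm_labels[0, 2] = token_ids[0, 2]      # pretend position 2 was selected for masking
mlm_inputs = token_ids.clone()
mlm_inputs[0, 2] = 103                  # 103 is [MASK] in the BERT vocab (assumed here)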
@@ -30,8 +34,10 @@ from torch.utils.data.distributed import DistributedSampler
 from tensorboardX import SummaryWriter
 from tqdm import tqdm, trange
 
-from pytorch_transformers import (WEIGHTS_NAME, GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
-                                  OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer)
+from pytorch_transformers import (WEIGHTS_NAME, GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                  OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                  BertConfig, BertForMaskedLM, BertTokenizer, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                  RobertaConfig, RobertaForMaskedLM, RobertaTokenizer, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
 from pytorch_transformers import AdamW, WarmupLinearSchedule
 
 from utils_lm import WikiTextDataset
@@ -42,7 +48,9 @@ ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (
 MODEL_CLASSES = {
     'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
-    'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer)
+    'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
+    "bert": (BertConfig, BertForMaskedLM, BertTokenizer),
+    "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer)
 }
@@ -53,6 +61,18 @@ def set_seed(args):
     if args.n_gpu > 0:
         torch.cuda.manual_seed_all(args.seed)
 
+
+# Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original
+def mask_tokens(inputs, tokenizer, args):
+    labels = inputs.clone()
+    masked_indices = torch.bernoulli(torch.full(labels.shape, args.mlm_probability)).byte()
+    labels[~masked_indices] = -1  # We only compute loss on masked tokens
+    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).byte() & masked_indices
+    inputs[indices_replaced] = tokenizer.vocab["[MASK]"]  # 80% of the time, replace masked input tokens with [MASK]
+    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).byte() & masked_indices & ~indices_replaced
+    random_words = torch.randint(args.num_embeddings, labels.shape, dtype=torch.long, device=args.device)
+    inputs[indices_random] = random_words[indices_random]  # 10% of the time, replace masked input tokens with random word
+    return inputs, labels
 
 def train(args, train_dataset, model, tokenizer):
     """ Train the model """
@@ -108,13 +128,14 @@ def train(args, train_dataset, model, tokenizer):
     tr_loss, logging_loss = 0.0, 0.0
     model.zero_grad()
     train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
-    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
+    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
     for _ in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
             batch.to(args.device)
             model.train()
-            outputs = model(batch, labels=batch)
+            inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
+            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
             loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
 
             if args.n_gpu > 1:
@@ -132,8 +153,8 @@ def train(args, train_dataset, model, tokenizer):
             tr_loss += loss.item()
             if (step + 1) % args.gradient_accumulation_steps == 0:
-                scheduler.step()  # Update learning rate schedule
                 optimizer.step()
+                scheduler.step()  # Update learning rate schedule
                 model.zero_grad()
                 global_step += 1
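The hunk above swaps the update order so the optimizer steps before the learning-rate scheduler, which is the ordering PyTorch expects: stepping the scheduler first would apply the next step's learning rate to the current update. A self-contained sketch of that ordering, with a toy model standing in for the script's model (an assumption, not the commit's code):

import torch
from pytorch_transformers import AdamW, WarmupLinearSchedule

model = torch.nn.Linear(4, 4)
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=10, t_total=100)

for _ in range(3):
    loss = model(torch.randn(2, 4)).sum()
    loss.backward()
    optimizer.step()     # apply the parameter update at the current learning rate
    scheduler.step()     # then advance the warmup/decay schedule for the next step
    model.zero_grad()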
@@ -196,7 +217,7 @@ def evaluate(args, model, tokenizer, prefix=""):
             batch.to(args.device)
 
             with torch.no_grad():
-                outputs = model(batch, labels=batch)
+                outputs = model(batch)
                 lm_loss = outputs[0]
                 eval_loss += lm_loss.mean().item()
             nb_eval_steps += 1
@@ -236,8 +257,16 @@ def main():
                         help="The output directory where the model predictions and checkpoints will be written.")
 
     ## Other parameters
-    parser.add_argument("--model_name_or_path", default="gpt2", type=str,
-                        help="The model to be fine-tuned.")
+    parser.add_argument("--model_name", default="bert", type=str,
+                        help="The model architecture to be fine-tuned.")
+    parser.add_argument("--model_checkpoint", default="bert-base-cased", type=str,
+                        help="The model checkpoint for weights initialization.")
+
+    parser.add_argument("--mlm", action='store_true',
+                        help="Train with masked-language modeling loss instead of language modeling.")
+    parser.add_argument("--mlm_probability", type=float, default=0.15,
+                        help="Ratio of tokens to mask for masked language modeling loss")
+
     parser.add_argument("--config_name", default="", type=str,
                         help="Pretrained config name or path if not the same as model_name")
     parser.add_argument("--tokenizer_name", default="", type=str,
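Putting the new flags together, a hedged example invocation (the output directory is a placeholder, and the WikiText-2 data handling lives in utils_lm.WikiTextDataset, whose flags are not shown in this diff):

python examples/run_generative_finetuning.py \
    --model_name bert \
    --model_checkpoint bert-base-cased \
    --mlm \
    --mlm_probability 0.15 \
    --do_train \
    --output_dir /tmp/wikitext2-bert-mlm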
@@ -303,6 +332,10 @@ def main():
     parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
     args = parser.parse_args()
 
+    if args.model_name in ["bert", "roberta"] and not args.mlm:
+        raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
+                         "flag (masked language modeling).")
+
     if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
         raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
@@ -339,10 +372,11 @@ def main():
     if args.local_rank not in [-1, 0]:
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
 
-    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_name_or_path]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
-    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_name]
+    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_checkpoint)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_checkpoint, do_lower_case=args.do_lower_case)
+    model = model_class.from_pretrained(args.model_checkpoint, from_tf=bool('.ckpt' in args.model_checkpoint), config=config)
+    args.num_embeddings = config.vocab_size  # We need this to create the model at next line (number of embeddings to use)
 
     if args.local_rank == 0:
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
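To make the new two-flag split concrete, a stripped-down sketch (not from the commit) of the lookup-and-load pattern in the hunk above, with the new defaults filled in:

from pytorch_transformers import BertConfig, BertForMaskedLM, BertTokenizer

MODEL_CLASSES = {"bert": (BertConfig, BertForMaskedLM, BertTokenizer)}

model_name, model_checkpoint = "bert", "bert-base-cased"               # --model_name / --model_checkpoint
config_class, model_class, tokenizer_class = MODEL_CLASSES[model_name]
config = config_class.from_pretrained(model_checkpoint)
tokenizer = tokenizer_class.from_pretrained(model_checkpoint, do_lower_case=False)
model = model_class.from_pretrained(model_checkpoint, config=config)
num_embeddings = config.vocab_size   # stored on args in the script so mask_tokens can draw random replacement ids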