chenpangpang / transformers
Commit 632f2d2d (unverified), authored Dec 11, 2018 by Thomas Wolf, committed by GitHub on Dec 11, 2018

Merge branch 'master' into fourth-release

Parents: b13abfa9, a3a3180c
Showing 3 changed files with 28 additions and 33 deletions:

  README.md                                  +1   -1
  examples/run_classifier.py                 +24  -30
  pytorch_pretrained_bert/optimization.py    +3   -2
README.md

@@ -19,7 +19,7 @@ This implementation is provided with [Google's pre-trained models](https://githu

 ## Installation

-This repo was tested on Python 3.5+ and PyTorch 0.4.1
+This repo was tested on Python 3.6+ and PyTorch 0.4.1

 ### With pip
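The requirement bump above (Python 3.6+ with PyTorch 0.4.1) can be sanity-checked from a Python prompt. A minimal, hypothetical snippet, not part of the repository:

import sys
import torch

# Hypothetical environment check mirroring the README requirement (Python 3.6+, PyTorch 0.4.1).
assert sys.version_info >= (3, 6), "the pytorch_pretrained_bert examples expect Python 3.6+"
print("Python:", sys.version.split()[0])
print("PyTorch:", torch.__version__)  # the README was tested against 0.4.1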
examples/run_classifier.py
@@ -35,7 +35,7 @@ from pytorch_pretrained_bert.modeling import BertForSequenceClassification
 from pytorch_pretrained_bert.optimization import BertAdam
 from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE

 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                     datefmt = '%m/%d/%Y %H:%M:%S',
                     level = logging.INFO)
 logger = logging.getLogger(__name__)
@@ -196,9 +196,7 @@ class ColaProcessor(DataProcessor):
 def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
     """Loads a data file into a list of `InputBatch`s."""

-    label_map = {}
-    for (i, label) in enumerate(label_list):
-        label_map[label] = i
+    label_map = {label : i for i, label in enumerate(label_list)}

     features = []
     for (ex_index, example) in enumerate(examples):
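The dict comprehension introduced here builds exactly the mapping the removed loop built. A quick hypothetical check (label names chosen to match the MNLI processor):

# Hypothetical equivalence check for the label_map rewrite.
label_list = ["contradiction", "entailment", "neutral"]

label_map_old = {}
for (i, label) in enumerate(label_list):
    label_map_old[label] = i

label_map_new = {label : i for i, label in enumerate(label_list)}

assert label_map_old == label_map_new   # {'contradiction': 0, 'entailment': 1, 'neutral': 2}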
@@ -207,8 +205,6 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
         tokens_b = None
         if example.text_b:
             tokens_b = tokenizer.tokenize(example.text_b)
-
-        if tokens_b:
             # Modifies `tokens_a` and `tokens_b` in place so that the total
             # length is less than the specified length.
             # Account for [CLS], [SEP], [SEP] with "- 3"
@@ -216,7 +212,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
         else:
             # Account for [CLS] and [SEP] with "- 2"
             if len(tokens_a) > max_seq_length - 2:
-                tokens_a = tokens_a[0:(max_seq_length - 2)]
+                tokens_a = tokens_a[:(max_seq_length - 2)]

         # The convention in BERT is:
         # (a) For sequence pairs:
@@ -236,22 +232,12 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
         # For classification tasks, the first vector (corresponding to [CLS]) is
         # used as as the "sentence vector". Note that this only makes sense because
         # the entire model is fine-tuned.
-        tokens = []
-        segment_ids = []
-        tokens.append("[CLS]")
-        segment_ids.append(0)
-        for token in tokens_a:
-            tokens.append(token)
-            segment_ids.append(0)
-        tokens.append("[SEP]")
-        segment_ids.append(0)
+        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
+        segment_ids = [0] * len(tokens)

         if tokens_b:
-            for token in tokens_b:
-                tokens.append(token)
-                segment_ids.append(1)
-            tokens.append("[SEP]")
-            segment_ids.append(1)
+            tokens += tokens_b + ["[SEP]"]
+            segment_ids += [1] * (len(tokens_b) + 1)

         input_ids = tokenizer.convert_tokens_to_ids(tokens)
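To see what the rewritten construction produces, here is a small hypothetical walk-through with toy word-piece lists (real inputs come from the BERT tokenizer):

# Toy illustration of the new list-based construction (hypothetical inputs).
tokens_a = ["the", "cat", "sat"]
tokens_b = ["on", "the", "mat"]

tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
segment_ids = [0] * len(tokens)              # [0, 0, 0, 0, 0]

if tokens_b:
    tokens += tokens_b + ["[SEP]"]
    segment_ids += [1] * (len(tokens_b) + 1)

# tokens      -> ['[CLS]', 'the', 'cat', 'sat', '[SEP]', 'on', 'the', 'mat', '[SEP]']
# segment_ids -> [0, 0, 0, 0, 0, 1, 1, 1, 1]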
@@ -260,10 +246,10 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
         input_mask = [1] * len(input_ids)

         # Zero-pad up to the sequence length.
-        while len(input_ids) < max_seq_length:
-            input_ids.append(0)
-            input_mask.append(0)
-            segment_ids.append(0)
+        padding = [0] * (max_seq_length - len(input_ids))
+        input_ids += padding
+        input_mask += padding
+        segment_ids += padding

         assert len(input_ids) == max_seq_length
         assert len(input_mask) == max_seq_length
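The removed while-loop and the new list arithmetic give identical fixed-length outputs; a hypothetical check with a short input:

# Hypothetical padding check (toy ids, max_seq_length = 8).
max_seq_length = 8
input_ids = [101, 7592, 2088, 102]      # pretend word-piece ids
input_mask = [1] * len(input_ids)
segment_ids = [0] * len(input_ids)

padding = [0] * (max_seq_length - len(input_ids))
input_ids += padding
input_mask += padding
segment_ids += padding

assert len(input_ids) == len(input_mask) == len(segment_ids) == max_seq_length
# input_ids  -> [101, 7592, 2088, 102, 0, 0, 0, 0]
# input_mask -> [1, 1, 1, 1, 0, 0, 0, 0]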
@@ -409,14 +395,14 @@ def main():
                         type=int,
                         default=-1,
                         help="local_rank for distributed training on gpus")
     parser.add_argument('--seed',
                         type=int,
                         default=42,
                         help="random seed for initialization")
     parser.add_argument('--gradient_accumulation_steps',
                         type=int,
                         default=1,
                         help="Number of updates steps to accumulate before performing a backward/update pass.")
     parser.add_argument('--optimize_on_cpu',
                         default=False,
                         action='store_true',
@@ -437,6 +423,12 @@ def main():
         "mrpc": MrpcProcessor,
     }

+    num_labels_task = {
+        "cola": 2,
+        "mnli": 3,
+        "mrpc": 2,
+    }
+
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
         n_gpu = torch.cuda.device_count()
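The added num_labels_task table sits alongside the existing processors table, so both are keyed by the lower-cased task name. A small hypothetical lookup for illustration:

# Hypothetical sketch of resolving the label count for a GLUE task,
# as run_classifier.py now does with the num_labels_task table.
num_labels_task = {
    "cola": 2,
    "mnli": 3,
    "mrpc": 2,
}

task_name = "mnli"                      # would normally be args.task_name.lower()
num_labels = num_labels_task[task_name]
print(task_name, "->", num_labels)      # mnli -> 3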
@@ -475,6 +467,7 @@ def main():
         raise ValueError("Task not found: %s" % (task_name))

     processor = processors[task_name]()
+    num_labels = num_labels_task[task_name]
     label_list = processor.get_labels()

     tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
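As a usage note, BertTokenizer.from_pretrained accepts either a shortcut name or a path, exactly as it is called above with args.bert_model. A minimal hypothetical example using the bert-base-uncased shortcut:

from pytorch_pretrained_bert.tokenization import BertTokenizer

# Hypothetical standalone use of the tokenizer, loaded the same way as in run_classifier.py.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

tokens_a = tokenizer.tokenize("The cat sat on the mat.")
input_ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens_a + ["[SEP]"])
print(tokens_a)
print(input_ids)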
@@ -487,8 +480,9 @@ def main():
             len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

     # Prepare model
-    cache_dir = PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank) # for distributed learning
-    model = BertForSequenceClassification.from_pretrained(args.bert_model, cache_dir=cache_dir)
+    model = BertForSequenceClassification.from_pretrained(args.bert_model,
+              cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank),
+              num_labels = num_labels)
     if args.fp16:
         model.half()
     model.to(device)
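For context, here is a hypothetical standalone version of the model preparation the merged hunk performs (the shortcut model name, label count and local_rank are stand-ins for the script's arguments):

import torch
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE

# Hypothetical values standing in for args.bert_model, num_labels and args.local_rank.
bert_model = "bert-base-uncased"
num_labels = 3          # e.g. MNLI
local_rank = -1         # single-process training

model = BertForSequenceClassification.from_pretrained(bert_model,
          cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(local_rank),
          num_labels=num_labels)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)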
pytorch_pretrained_bert/optimization.py
@@ -17,6 +17,7 @@
 import math
 import torch
 from torch.optim import Optimizer
+from torch.optim.optimizer import required
 from torch.nn.utils import clip_grad_norm_

 def warmup_cosine(x, warmup=0.002):
@@ -55,10 +56,10 @@ class BertAdam(Optimizer):
         weight_decay_rate: Weight decay. Default: 0.01
         max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
     """
-    def __init__(self, params, lr, warmup=-1, t_total=-1, schedule='warmup_linear',
+    def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear',
                  b1=0.9, b2=0.999, e=1e-6, weight_decay_rate=0.01,
                  max_grad_norm=1.0):
-        if not lr >= 0.0:
+        if lr is not required and lr < 0.0:
             raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
         if schedule not in SCHEDULES:
             raise ValueError("Invalid schedule parameter: {}".format(schedule))
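Defaulting lr to torch.optim's required sentinel brings BertAdam in line with the built-in optimizers: the learning rate may be supplied per parameter group, and the validation above only runs when an actual number was passed. A small hypothetical usage sketch (model and step count are stand-ins):

import torch
from pytorch_pretrained_bert.optimization import BertAdam

# Stand-in model and schedule length, for illustration only.
model = torch.nn.Linear(768, 2)
num_train_steps = 1000

optimizer = BertAdam(model.parameters(),
                     lr=5e-5,                  # still has to be >= 0.0, as the new check enforces
                     warmup=0.1,               # fraction of t_total spent in linear warmup
                     t_total=num_train_steps)  # total steps for the warmup_linear schedule

# Leaving lr out no longer raises a TypeError for a missing positional argument;
# the torch.optim.Optimizer base class reports the missing required parameter instead.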