Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
fa5222c2
Commit
fa5222c2
authored
Jan 10, 2019
by
thomwolf
Browse files
update readme
parent
ab90d4cd
Changes
2
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
206 additions
and
322 deletions
+206
-322
README.md
README.md
+206
-18
examples/run_openai_gpt.py
examples/run_openai_gpt.py
+0
-304
No files found.
README.md
View file @
fa5222c2
This diff is collapsed.
Click to expand it.
examples/run_openai_gpt.py
deleted
100644 → 0
View file @
ab90d4cd
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
" Run OpenAI GPT on RocStories"
import
argparse
import
os
import
random
import
logging
from
sklearn.metrics
import
accuracy_score
from
sklearn.utils
import
shuffle
# from analysis import rocstories as rocstories_analysis
# from datasets import rocstories
# from model_pytorch import DoubleHeadModel, load_openai_pretrained_model
# from opt import OpenAIAdam
# from text_utils import TextEncoder
# from utils import (encode_dataset, iter_data,
# ResultLogger, make_path)
# from loss import MultipleChoiceLossCompute
import
numpy
as
np
import
torch
from
torch.utils.data
import
TensorDataset
,
DataLoader
,
RandomSampler
,
SequentialSampler
from
torch.utils.data.distributed
import
DistributedSampler
from
pytorch_pretrained_bert.tokenization_openai
import
OpenAIGPTTokenizer
from
pytorch_pretrained_bert.modeling_openai
import
OpenAIGPTDoubleHeadsModel
from
pytorch_pretrained_bert.optimization_openai
import
OpenAIAdam
from
pytorch_pretrained_bert.file_utils
import
PYTORCH_PRETRAINED_BERT_CACHE
# Configure the root logger at import time (INFO level, timestamped records)
# and create this module's logger.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
)
logger = logging.getLogger(__name__)
def transform_roc(X1, X2, X3):
    """Pack (context, ending, ending) triples into padded input and mask arrays.

    For every example two sequences are built, one per candidate ending:
    ``[start] + x1[:max_len] + [delimiter] + ending[:max_len] + [clf_token]``.

    Reads the module-level globals ``n_ctx``, ``max_len``, ``encoder``,
    ``clf_token``, ``n_vocab`` and ``n_special`` (assigned in the
    ``__main__`` section), so it can only be called after they are set.

    Args:
        X1: sequence of encoded contexts (one list of token ids per example).
        X2: sequence of encoded first endings, aligned with ``X1``.
        X3: sequence of encoded second endings, aligned with ``X1``.

    Returns:
        ``(xmb, mmb)`` where ``xmb`` is an int32 array of shape
        ``(len(X1), 2, n_ctx, 2)`` (channel 0: zero-padded token ids,
        channel 1: position ids) and ``mmb`` is a float32 array of shape
        ``(len(X1), 2, n_ctx)`` set to 1 over the filled prefix of each row.
    """
    n_examples = len(X1)
    tokens = np.zeros((n_examples, 2, n_ctx, 2), dtype=np.int32)
    mask = np.zeros((n_examples, 2, n_ctx), dtype=np.float32)
    start_id = encoder['_start_']
    delimiter_id = encoder['_delimiter_']

    for row, (context, ending_a, ending_b) in enumerate(zip(X1, X2, X3)):
        # Both candidate sequences share the same truncated context prefix.
        prefix = [start_id] + context[:max_len] + [delimiter_id]
        for choice, ending in enumerate((ending_a, ending_b)):
            sequence = prefix + ending[:max_len] + [clf_token]
            length = len(sequence)
            tokens[row, choice, :length, 0] = sequence
            mask[row, choice, :length] = 1

    # Position information that is added to the input embeddings in the TransformerModel
    first_position = n_vocab + n_special
    tokens[:, :, :, 1] = np.arange(first_position, first_position + n_ctx)
    return tokens, mask
def iter_apply(Xs, Ms, Ys):
    """Evaluate the model on labelled data and return ``(logits, cost)``.

    The global ``dh_model`` is switched to eval mode and run under
    ``torch.no_grad()`` over batches produced by ``iter_data`` (batch size
    ``n_batch_train``, no truncation).

    Args:
        Xs: model inputs, batched together with ``Ms`` and ``Ys``.
        Ms: masks aligned with ``Xs``.
        Ys: labels aligned with ``Xs``.

    Returns:
        ``logits``: per-batch classification logits concatenated along
        axis 0 as a numpy array; ``cost``: the running sum of
        ``clf_losses.sum()`` over all batches.
    """
    collected = []
    total_cost = 0
    with torch.no_grad():
        dh_model.eval()
        batches = iter_data(Xs, Ms, Ys, n_batch=n_batch_train,
                            truncate=False, verbose=True)
        for xmb, mmb, ymb in batches:
            batch_len = len(xmb)
            inputs = torch.tensor(xmb, dtype=torch.long).to(device)
            labels = torch.tensor(ymb, dtype=torch.long).to(device)
            masks = torch.tensor(mmb).to(device)
            _, clf_logits = dh_model(inputs)
            # NOTE(review): the logits are scaled in place by the batch length
            # *before* the loss is computed, and the losses are scaled again
            # below. Kept exactly as written -- confirm this is intended.
            clf_logits *= batch_len
            clf_losses = compute_loss_fct(inputs, labels, masks, clf_logits,
                                          only_return_losses=True)
            clf_losses *= batch_len
            collected.append(clf_logits.to("cpu").numpy())
            total_cost += clf_losses.sum().item()
    stacked = np.concatenate(collected, 0)
    return stacked, total_cost
def iter_predict(Xs, Ms):
    """Return classification logits for unlabelled inputs.

    The global ``dh_model`` is switched to eval mode and run under
    ``torch.no_grad()`` over batches produced by ``iter_data`` (batch size
    ``n_batch_train``, no truncation).

    Args:
        Xs: model inputs.
        Ms: masks aligned with ``Xs``; they are batched alongside ``Xs`` but
            are not fed to the model.

    Returns:
        The per-batch classification logits concatenated along axis 0 as a
        numpy array.
    """
    collected = []
    with torch.no_grad():
        dh_model.eval()
        # The mask batch is never consumed here, so it is neither converted
        # to a tensor nor copied to the device.
        for xmb, _mmb in iter_data(Xs, Ms, n_batch=n_batch_train,
                                   truncate=False, verbose=True):
            XMB = torch.tensor(xmb, dtype=torch.long).to(device)
            _, clf_logits = dh_model(XMB)
            collected.append(clf_logits.to("cpu").numpy())
    return np.concatenate(collected, 0)
def log(save_dir, desc):
    """Evaluate on train/validation data, record metrics, keep the best weights.

    Costs and accuracies are computed with ``iter_apply`` on the first
    ``n_valid`` training examples and on the validation set, passed to
    ``logger.log`` and printed. When the global ``submit`` flag is set and the
    validation accuracy beats ``best_score``, the model state dict is saved
    under ``<save_dir>/<desc>/best_params``.

    Args:
        save_dir: root directory for saved parameters.
        desc: run description, used as a sub-directory of ``save_dir``.
    """
    global best_score
    print("Logging")

    train_x = trX[:n_valid]
    train_m = trM[:n_valid]
    train_y = trY[:n_valid]
    tr_logits, tr_cost = iter_apply(train_x, train_m, train_y)
    va_logits, va_cost = iter_apply(vaX, vaM, vaY)

    # iter_apply returns summed costs; turn them into per-example averages.
    tr_cost = tr_cost / len(train_y)
    va_cost = va_cost / n_valid
    tr_acc = accuracy_score(train_y, np.argmax(tr_logits, 1)) * 100.
    va_acc = accuracy_score(vaY, np.argmax(va_logits, 1)) * 100.

    # NOTE(review): called with keyword arguments only, so ``logger`` must be
    # the ResultLogger bound in the ``__main__`` section, not the
    # logging.Logger created at import time.
    logger.log(n_epochs=n_epochs, n_updates=n_updates,
               tr_cost=tr_cost, va_cost=va_cost,
               tr_acc=tr_acc, va_acc=va_acc)
    print('%d %d %.3f %.3f %.2f %.2f' % (n_epochs, n_updates, tr_cost, va_cost, tr_acc, va_acc))

    if not submit:
        return
    score = va_acc
    if score > best_score:
        best_score = score
        path = os.path.join(save_dir, desc, 'best_params')
        torch.save(dh_model.state_dict(), make_path(path))
def predict(dataset, submission_dir):
    """Write test-set predictions for ``dataset`` as a two-column TSV file.

    The output file name, the logits-to-prediction function and the optional
    label decoder are looked up in the module-level ``filenames``,
    ``pred_fns`` and ``label_decoders`` tables. Predictions are computed from
    ``iter_predict(teX, teM)``.

    Args:
        dataset: key into the per-dataset tables (e.g. ``'rocstories'``).
        submission_dir: directory that receives the file; created if missing.

    Raises:
        KeyError: if ``dataset`` is not present in the lookup tables.
    """
    filename = filenames[dataset]
    pred_fn = pred_fns[dataset]
    label_decoder = label_decoders[dataset]

    predictions = pred_fn(iter_predict(teX, teM))
    if label_decoder is not None:
        predictions = [label_decoder[prediction] for prediction in predictions]

    path = os.path.join(submission_dir, filename)
    out_dir = os.path.dirname(path)
    # dirname is '' when submission_dir is empty; os.makedirs('') would raise.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Explicit encoding keeps the output independent of the platform default.
    with open(path, 'w', encoding='utf-8') as f:
        f.write('{}\t{}\n'.format('index', 'prediction'))
        f.writelines('{}\t{}\n'.format(i, prediction)
                     for i, prediction in enumerate(predictions))
def run_epoch():
    """Run one training pass over the shuffled training set.

    For every batch the model is put in train mode, run on the inputs, and
    ``compute_loss_fct`` is called with the inputs, labels, masks and both
    sets of logits. The global ``n_updates`` counter is incremented per
    batch; during the first epoch (``n_epochs == 0``) ``log`` is also called
    at 1000, 2000, 4000, 8000, 16000 and 32000 updates.
    """
    global n_updates

    shuffled = shuffle(trX, trM, trYt, random_state=np.random)
    batches = iter_data(*shuffled, n_batch=n_batch_train, truncate=True, verbose=True)
    for xmb, mmb, ymb in batches:
        dh_model.train()
        inputs = torch.tensor(xmb, dtype=torch.long).to(device)
        labels = torch.tensor(ymb, dtype=torch.long).to(device)
        masks = torch.tensor(mmb).to(device)
        lm_logits, clf_logits = dh_model(inputs)
        # NOTE(review): the return value is ignored; presumably the loss
        # object performs the backward pass and optimizer step -- confirm.
        compute_loss_fct(inputs, labels, masks, clf_logits, lm_logits)
        n_updates += 1
        at_milestone = n_updates in [1000, 2000, 4000, 8000, 16000, 32000]
        if at_milestone and n_epochs == 0:
            log(save_dir, desc)
def argmax(x):
    """Return the index of the largest entry along axis 1 of ``x``."""
    return np.argmax(x, 1)


# Per-dataset lookup tables used by ``predict``.
# Maps dataset name -> function turning logits into predictions.
pred_fns = {
    'rocstories': argmax,
}

# Maps dataset name -> submission file name.
filenames = {
    'rocstories': 'ROCStories.tsv',
}

# Maps dataset name -> optional mapping from prediction to label (None: keep as is).
label_decoders = {
    'rocstories': None,
}
if __name__ == '__main__':
    # Script entry point: parse arguments, encode ROCStories, build the model
    # and optimizer, train for ``--n_iter`` epochs and optionally write a
    # submission file.
    #
    # Everything here is assigned at module scope on purpose: the functions
    # above (transform_roc, iter_apply, iter_predict, log, predict, run_epoch)
    # read these names as globals.
    #
    # NOTE(review): ResultLogger, TextEncoder, encode_dataset, rocstories,
    # rocstories_analysis, DoubleHeadModel, MultipleChoiceLossCompute,
    # load_openai_pretrained_model, iter_data and make_path are used in this
    # file but their imports are commented out at the top -- they must be
    # provided before this script can run.
    parser = argparse.ArgumentParser()
    parser.add_argument('--desc', type=str, help="Description")
    parser.add_argument('--dataset', type=str)
    parser.add_argument('--log_dir', type=str, default='log/')
    parser.add_argument('--save_dir', type=str, default='save/')
    parser.add_argument('--data_dir', type=str, default='data/')
    parser.add_argument('--submission_dir', type=str, default='submission/')
    parser.add_argument('--submit', action='store_true')
    parser.add_argument('--analysis', action='store_true')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--n_iter', type=int, default=3)
    parser.add_argument('--n_batch', type=int, default=8)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--lr', type=float, default=6.25e-5)
    parser.add_argument('--lr_warmup', type=float, default=0.002)
    parser.add_argument('--n_ctx', type=int, default=512)
    parser.add_argument('--n_embd', type=int, default=768)
    parser.add_argument('--n_head', type=int, default=12)
    parser.add_argument('--n_layer', type=int, default=12)
    parser.add_argument('--embd_pdrop', type=float, default=0.1)
    parser.add_argument('--attn_pdrop', type=float, default=0.1)
    parser.add_argument('--resid_pdrop', type=float, default=0.1)
    parser.add_argument('--clf_pdrop', type=float, default=0.1)
    parser.add_argument('--l2', type=float, default=0.01)
    parser.add_argument('--vector_l2', action='store_true')
    parser.add_argument('--opt', type=str, default='adam')
    parser.add_argument('--afn', type=str, default='gelu')
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--encoder_path', type=str, default='model/encoder_bpe_40000.json')
    parser.add_argument('--bpe_path', type=str, default='model/vocab_40000.bpe')
    parser.add_argument('--n_transfer', type=int, default=12)
    parser.add_argument('--lm_coef', type=float, default=0.5)
    parser.add_argument('--b1', type=float, default=0.9)
    parser.add_argument('--b2', type=float, default=0.999)
    parser.add_argument('--e', type=float, default=1e-8)
    parser.add_argument('--n_valid', type=int, default=374)

    args = parser.parse_args()
    print(args)

    # Seed every random number generator in use for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Constants
    submit = args.submit
    dataset = args.dataset
    n_ctx = args.n_ctx
    save_dir = args.save_dir
    desc = args.desc
    data_dir = args.data_dir
    log_dir = args.log_dir
    submission_dir = args.submission_dir

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)

    # Rebinds the module-level ``logger``; ``log`` relies on this object.
    logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)), **args.__dict__)
    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    # Vocabulary size is taken before the special tokens are appended below.
    n_vocab = len(text_encoder.encoder)

    print("Encoding dataset...")
    ((trX1, trX2, trX3, trY),
     (vaX1, vaX2, vaX3, vaY),
     (teX1, teX2, teX3)) = encode_dataset(*rocstories(data_dir, n_valid=args.n_valid),
                                          encoder=text_encoder)

    # Each special token gets the next free id, so the order matters.
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3

    # Per-segment token budget: half the context minus room for special tokens.
    max_len = n_ctx // 2 - 2
    # Shrink the context to the longest (context + ending) pair seen in any
    # split, plus the three special tokens, capped at the requested n_ctx.
    longest_pair = max(
        len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
        for split in ((trX1, trX2, trX3), (vaX1, vaX2, vaX3), (teX1, teX2, teX3))
        for x1, x2, x3 in zip(*split)
    )
    n_ctx = min(longest_pair + 3, n_ctx)

    # Embedding table size: regular tokens + special tokens + position ids.
    vocab = n_vocab + n_special + n_ctx
    trX, trM = transform_roc(trX1, trX2, trX3)
    vaX, vaM = transform_roc(vaX1, vaX2, vaX3)
    if submit:
        teX, teM = transform_roc(teX1, teX2, teX3)

    n_train = len(trY)
    n_valid = len(vaY)
    n_batch_train = args.n_batch * max(n_gpu, 1)
    n_updates_total = (n_train // n_batch_train) * args.n_iter

    dh_model = DoubleHeadModel(args, clf_token, 'multiple_choice', vocab, n_ctx)

    # ``nn`` is never imported in this file, so go through ``torch.nn``.
    # ``reduction='none'`` is the non-deprecated spelling of ``reduce=False``.
    criterion = torch.nn.CrossEntropyLoss(reduction='none')
    model_opt = OpenAIAdam(dh_model.parameters(),
                           lr=args.lr,
                           schedule=args.lr_schedule,
                           warmup=args.lr_warmup,
                           t_total=n_updates_total,
                           b1=args.b1,
                           b2=args.b2,
                           e=args.e,
                           l2=args.l2,
                           vector_l2=args.vector_l2,
                           max_grad_norm=args.max_grad_norm)
    compute_loss_fct = MultipleChoiceLossCompute(criterion,
                                                 criterion,
                                                 args.lm_coef,
                                                 model_opt)
    load_openai_pretrained_model(dh_model.transformer, n_ctx=n_ctx, n_special=n_special)

    dh_model.to(device)
    dh_model = torch.nn.DataParallel(dh_model)

    n_updates = 0
    n_epochs = 0
    if dataset != 'stsb':
        trYt = trY
    if submit:
        # Save the initial weights so 'best_params' exists even if no epoch
        # improves on the starting best_score.
        path = os.path.join(save_dir, desc, 'best_params')
        torch.save(dh_model.state_dict(), make_path(path))
    best_score = 0

    for i in range(args.n_iter):
        print("running epoch", i)
        run_epoch()
        n_epochs += 1
        log(save_dir, desc)

    if submit:
        # Reload the best checkpoint before producing the submission file.
        path = os.path.join(save_dir, desc, 'best_params')
        dh_model.load_state_dict(torch.load(path))
        predict(dataset, args.submission_dir)
        if args.analysis:
            rocstories_analysis(data_dir,
                                os.path.join(args.submission_dir, 'ROCStories.tsv'),
                                os.path.join(log_dir, 'rocstories.jsonl'))
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment