chenpangpang / transformers · Commits

Commit 3cf12b23 · authored Jan 08, 2019 by thomwolf

    added tests + fixed losses

Parent: eed51c5b
Showing 4 changed files with 479 additions and 220 deletions:

    pytorch_pretrained_bert/modeling.py              +1    -1
    pytorch_pretrained_bert/modeling_openai.py       +228  -187
    pytorch_pretrained_bert/tokenization_openai.py   +58   -32
    tests/modeling_openai_test.py                    +192  -0
pytorch_pretrained_bert/modeling.py

...
@@ -549,7 +549,7 @@ class BertPreTrainedModel(nn.Module):
                 model.__class__.__name__, unexpected_keys))
         if len(error_msgs) > 0:
             raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
-                self.__class__.__name__, "\n\t".join(error_msgs)))
+                model.__class__.__name__, "\n\t".join(error_msgs)))
         if tempdir:
             # Clean up temp dir
             shutil.rmtree(tempdir)
...
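This one-line fix matters because `from_pretrained` is a classmethod: no instance is bound, so referencing `self` while formatting the error message raised a `NameError` instead of the intended `RuntimeError`. A minimal sketch of the failure mode (hypothetical class name, for illustration only):

    class Loader(object):
        @classmethod
        def from_pretrained(cls):
            error_msgs = ["size mismatch for weight"]
            # Before the fix, this method referenced `self`, which does not
            # exist inside a classmethod, so a NameError was raised instead
            # of the intended RuntimeError.
            model = cls()  # the instantiated model is what is in scope here
            raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
                model.__class__.__name__, "\n\t".join(error_msgs)))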
pytorch_pretrained_bert/modeling_openai.py

This diff is collapsed.
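The collapsed diff carries the "fixed losses" part of the commit. Judging from the new test file below, `OpenAIGPTDoubleHeadsModel` now returns a pair of scalar losses (language modeling and multiple choice) when labels are passed. A hedged sketch of how such a double-heads loss is typically computed; the flattening and `CrossEntropyLoss` usage are assumptions about the technique, not the exact contents of the collapsed diff:

    import torch.nn as nn

    def double_heads_losses(lm_logits, mc_logits, lm_labels, mc_labels):
        # lm_logits: (batch, n_choices, seq_len, vocab); lm_labels: (batch, n_choices, seq_len)
        # mc_logits: (batch, n_choices);                 mc_labels: (batch,)
        lm_loss = nn.CrossEntropyLoss()(lm_logits.view(-1, lm_logits.size(-1)),
                                        lm_labels.view(-1))
        mc_loss = nn.CrossEntropyLoss()(mc_logits, mc_labels)
        # Two scalar losses, matching the test's [[], []] size check below.
        return [lm_loss, mc_loss]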
pytorch_pretrained_bert/tokenization_openai.py

...
@@ -67,19 +67,17 @@ class OpenAIGPTTokenizer(object):
     mostly a wrapper for a public python bpe tokenizer
     """
     @classmethod
-    def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
         """
         Instantiate a PreTrainedBertModel from a pre-trained model file.
         Download and cache the pre-trained model file if needed.
         """
-        if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP:
-            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name]
-            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name]
+        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
+            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
+            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
         else:
-            vocab_file = pretrained_model_name
-            if os.path.isdir(vocab_file):
-                vocab_file = os.path.join(vocab_file, VOCAB_NAME)
-                merges_file = os.path.join(vocab_file, MERGES_NAME)
+            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
+            merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
         # redirect to the cache, if necessary
         try:
             resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
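After this change, the argument resolves either a registered model name or a local directory containing the vocabulary and merges files. A hedged usage sketch (the shortcut name 'openai-gpt' and the import path are assumptions based on the archive-map lookup above, not stated in this diff):

    from pytorch_pretrained_bert import OpenAIGPTTokenizer

    # By shortcut name: files are fetched from the archive maps and cached.
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

    # By path: the directory is expected to contain VOCAB_NAME and MERGES_NAME.
    tokenizer = OpenAIGPTTokenizer.from_pretrained('/path/to/model_dir')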
...
@@ -87,11 +85,12 @@ class OpenAIGPTTokenizer(object):
         except FileNotFoundError:
             logger.error(
                 "Model name '{}' was not found in model name list ({}). "
-                "We assumed '{}' was a path or url but couldn't find any file "
-                "associated to this path or url.".format(
-                    pretrained_model_name,
+                "We assumed '{}' was a path or url but couldn't find files {} and {} "
+                "at this path or url.".format(
+                    pretrained_model_name_or_path,
                     ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
-                    vocab_file))
+                    pretrained_model_name_or_path,
+                    vocab_file, merges_file))
             return None
         if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
             logger.info("loading vocabulary file {}".format(vocab_file))
...
@@ -101,29 +100,38 @@ class OpenAIGPTTokenizer(object):
                 vocab_file, resolved_vocab_file))
             logger.info("loading merges file {} from cache at {}".format(
                 merges_file, resolved_merges_file))
-        if pretrained_model_name in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
+        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
             # if we're using a pretrained model, ensure the tokenizer wont index sequences longer
             # than the number of positional embeddings
-            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name]
+            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
             kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
         # Instantiate tokenizer.
         tokenizer = cls(resolved_vocab_file, resolved_merges_file, *inputs, **kwargs)
         return tokenizer
 
-    def __init__(self, vocab_file, merges_file):
+    def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
         try:
             import ftfy
             import spacy
         except ImportError:
             raise ImportError("Please install ftfy and spacy to use OpenAI GPT tokenizer.")
 
+        self.max_len = max_len if max_len is not None else int(1e12)
         self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
+        self.fix_text = ftfy.fix_text
         self.encoder = json.load(open(vocab_file))
         self.decoder = {v: k for k, v in self.encoder.items()}
         merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
         merges = [tuple(merge.split()) for merge in merges]
         self.bpe_ranks = dict(zip(merges, range(len(merges))))
         self.cache = {}
+        if not special_tokens:
+            self.special_tokens = {}
+        else:
+            self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
+
+    def set_special_tokens(self, special_tokens):
+        self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
 
     def bpe(self, token):
         word = tuple(token[:-1]) + (token[-1] + '</w>',)
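The new `special_tokens` machinery appends extra tokens after the end of the BPE vocabulary: token i of the list gets id `len(self.encoder) + i`. A small worked example (the vocabulary size and token names are illustrative, not from the commit):

    # Suppose the BPE vocabulary has 40000 entries (ids 0..39999).
    tokenizer = OpenAIGPTTokenizer(vocab_file, merges_file,
                                   special_tokens=['_start_', '_classify_'])
    # -> tokenizer.special_tokens == {'_start_': 40000, '_classify_': 40001}

    # The mapping can also be (re)set after construction:
    tokenizer.set_special_tokens(['_start_', '_delimiter_', '_classify_'])
    # -> ids 40000, 40001, 40002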
...
@@ -168,20 +176,38 @@ class OpenAIGPTTokenizer(object):
         self.cache[token] = word
         return word
 
-    def tokenize(self, texts, verbose=True):
-        texts_tokens = []
-        if verbose:
-            for text in tqdm(texts, ncols=80, leave=False):
-                text = self.nlp(text_standardize(ftfy.fix_text(text)))
-                text_tokens = []
-                for token in text:
-                    text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
-                texts_tokens.append(text_tokens)
-        else:
-            for text in texts:
-                text = self.nlp(text_standardize(ftfy.fix_text(text)))
-                text_tokens = []
-                for token in text:
-                    text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
-                texts_tokens.append(text_tokens)
-        return texts_tokens
+    def tokenize(self, text):
+        split_tokens = []
+        text = self.nlp(text_standardize(self.fix_text(text)))
+        for token in text:
+            split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
+        return split_tokens
+
+    def convert_tokens_to_ids(self, tokens):
+        """Converts a sequence of tokens into ids using the vocab."""
+        ids = []
+        for token in tokens:
+            if token in self.special_tokens:
+                ids.append(self.special_tokens[token])
+            else:
+                ids.append(self.encoder.get(token, 0))
+        if len(ids) > self.max_len:
+            raise ValueError(
+                "Token indices sequence length is longer than the specified maximum "
+                " sequence length for this BERT model ({} > {}). Running this"
+                " sequence through BERT will result in indexing errors".format(len(ids), self.max_len)
+            )
+        return ids
+
+    def convert_ids_to_tokens(self, ids):
+        """Converts a sequence of ids in BPE tokens using the vocab."""
+        tokens = []
+        for i in ids:
+            tokens.append(self.decoder[i])
+        return tokens
+
+    def decode(self, ids):
+        """Converts a sequence of ids in a string."""
+        tokens = self.convert_ids_to_tokens(ids)
+        out_string = ''.join(tokens).replace('</w>', ' ')
+        return out_string
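Taken together, `tokenize` now takes a single string and returns string tokens, with id conversion split out into its own step. A hedged round-trip sketch (assumes a tokenizer instance built as above; exact BPE output depends on the vocabulary):

    tokens = tokenizer.tokenize("Hello world!")    # list of BPE string tokens
    ids = tokenizer.convert_tokens_to_ids(tokens)  # special tokens first, then
                                                   # the BPE vocab (0 for unknowns)
    back = tokenizer.convert_ids_to_tokens(ids)    # inverse lookup via self.decoder
    text = tokenizer.decode(ids)                   # join, replace '</w>' with spaces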
tests/modeling_openai_test.py  (new file, mode 100644)
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import unittest
import json
import random

import torch

from pytorch_pretrained_bert import (OpenAIGPTConfig, OpenAIGPTModel,
                                     OpenAIGPTDoubleHeadsModel)


class OpenAIGPTModelTest(unittest.TestCase):
    class OpenAIGPTModelTester(object):

        def __init__(self,
                     parent,
                     batch_size=13,
                     seq_length=7,
                     is_training=True,
                     use_position_ids=True,
                     use_token_type_ids=True,
                     use_labels=True,
                     vocab_size=99,
                     n_special=1,
                     n_ctx=33,
                     n_embd=32,
                     n_layer=5,
                     n_head=4,
                     n_choices=3,
                     afn="gelu",
                     resid_pdrop=0.1,
                     attn_pdrop=0.1,
                     embd_pdrop=0.1,
                     type_sequence_label_size=2,
                     initializer_range=0.02,
                     num_labels=3,
                     scope=None):
            self.parent = parent
            self.batch_size = batch_size
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_position_ids = use_position_ids
            self.use_token_type_ids = use_token_type_ids
            self.use_labels = use_labels
            self.vocab_size = vocab_size
            self.n_special = n_special
            self.n_ctx = n_ctx
            self.n_embd = n_embd
            self.n_layer = n_layer
            self.n_head = n_head
            self.afn = afn
            self.n_choices = n_choices
            self.resid_pdrop = resid_pdrop
            self.attn_pdrop = attn_pdrop
            self.embd_pdrop = embd_pdrop
            self.type_sequence_label_size = type_sequence_label_size
            self.initializer_range = initializer_range
            self.num_labels = num_labels
            self.scope = scope

        def prepare_config_and_inputs(self):
            input_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.vocab_size)

            position_ids = None
            if self.use_position_ids:
                position_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_ctx)
                position_ids = position_ids + self.n_special + self.vocab_size

            token_type_ids = None
            if self.use_token_type_ids:
                total_voc = self.n_ctx + self.n_special + self.vocab_size
                token_type_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)

            multiple_choice_labels = None
            lm_labels = None
            classification_token_mask = None
            if self.use_labels:
                multiple_choice_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size)
                lm_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels)
                classification_token_mask = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], 2).float()

            config = OpenAIGPTConfig(
                vocab_size_or_config_json_file=self.vocab_size,
                n_ctx=self.n_ctx,
                n_special=self.n_special,
                n_embd=self.n_embd,
                n_layer=self.n_layer,
                n_head=self.n_head,
                afn=self.afn,
                resid_pdrop=self.resid_pdrop,
                attn_pdrop=self.attn_pdrop,
                embd_pdrop=self.embd_pdrop,
                initializer_range=self.initializer_range)

            return (config, input_ids, token_type_ids, position_ids,
                    multiple_choice_labels, lm_labels, classification_token_mask)

        def create_openai_model(self, config, input_ids, token_type_ids, position_ids,
                                multiple_choice_labels, lm_labels, classification_token_mask):
            model = OpenAIGPTModel(config)
            hidden_states = model(input_ids, position_ids, token_type_ids)
            outputs = {
                "hidden_states": hidden_states,
            }
            return outputs

        def check_openai_model_output(self, result):
            self.parent.assertListEqual(
                list(result["hidden_states"].size()),
                [self.batch_size, self.n_choices, self.seq_length, self.n_embd])

        def create_openai_double_heads(self, config, input_ids, token_type_ids, position_ids,
                                       multiple_choice_labels, lm_labels, classification_token_mask):
            model = OpenAIGPTDoubleHeadsModel(config)
            loss = model(input_ids, classification_token_mask, position_ids,
                         token_type_ids, lm_labels, multiple_choice_labels)
            lm_logits, multiple_choice_logits = model(input_ids, classification_token_mask, position_ids, token_type_ids)
            outputs = {
                "loss": loss,
                "lm_logits": lm_logits,
                "multiple_choice_logits": multiple_choice_logits,
            }
            return outputs

        def check_openai_double_heads_output(self, result):
            total_voc = self.n_ctx + self.n_special + self.vocab_size
            self.parent.assertListEqual(
                list(result["lm_logits"].size()),
                [self.batch_size, self.n_choices, self.seq_length, total_voc])
            self.parent.assertListEqual(
                list(result["multiple_choice_logits"].size()),
                [self.batch_size, self.n_choices])

        def check_openai_double_heads_loss_output(self, result):
            self.parent.assertListEqual(
                [list(l.size()) for l in result["loss"]],
                [[], []])

    def test_default(self):
        self.run_tester(OpenAIGPTModelTest.OpenAIGPTModelTester(self))

    def test_config_to_json_string(self):
        config = OpenAIGPTConfig(vocab_size_or_config_json_file=99, n_embd=37)
        obj = json.loads(config.to_json_string())
        self.assertEqual(obj["vocab_size"], 99)
        self.assertEqual(obj["n_embd"], 37)

    def run_tester(self, tester):
        config_and_inputs = tester.prepare_config_and_inputs()
        output_result = tester.create_openai_model(*config_and_inputs)
        tester.check_openai_model_output(output_result)

        output_result = tester.create_openai_double_heads(*config_and_inputs)
        tester.check_openai_double_heads_output(output_result)
        tester.check_openai_double_heads_loss_output(output_result)

    @classmethod
    def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
        """Creates a random int32 tensor of the shape within the vocab size."""
        if rng is None:
            rng = random.Random()

        total_dims = 1
        for dim in shape:
            total_dims *= dim

        values = []
        for _ in range(total_dims):
            values.append(rng.randint(0, vocab_size - 1))

        return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()


if __name__ == "__main__":
    unittest.main()
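Because the file ends with the standard `unittest.main()` guard, the new tests can be run directly from the repository root:

    python tests/modeling_openai_test.py

A test runner such as pytest would also collect the `OpenAIGPTModelTest` case; that is an assumption about the project's test setup rather than something stated in the commit.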