chenpangpang / transformers

Commit 009ee86a, authored Feb 17, 2019 by thomwolf (parent: ffd62382)

fix tests - bump up version

Showing 6 changed files with 28 additions and 79 deletions (+28, -79).
pytorch_pretrained_bert/__init__.py          +1   -1
pytorch_pretrained_bert/modeling_gpt2.py     +22  -14
pytorch_pretrained_bert/modeling_openai.py   +1   -1
setup.py                                     +1   -1
tests/modeling_gpt2_test.py                  +3   -6
tests/tokenization_gpt2_test.py              +0   -56
pytorch_pretrained_bert/__init__.py

-__version__ = "0.5.1"
+__version__ = "0.6.0"
 from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
 ...
pytorch_pretrained_bert/modeling_gpt2.py

@@ -64,20 +64,24 @@ def load_tf_weights_in_gpt2(model, gpt2_checkpoint_path):
         print("Loading TF weight {} with shape {}".format(name, shape))
         array = tf.train.load_variable(tf_path, name)
         names.append(name)
-        arrays.append(array)
+        arrays.append(array.squeeze())
 
     for name, array in zip(names, arrays):
         name = name[6:]  # skip "model/"
         name = name.split('/')
         pointer = model
         for m_name in name:
-            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
-                l = re.split(r'_(\d+)', m_name)
+            if re.fullmatch(r'[A-Za-z]+\d+', m_name):
+                l = re.split(r'(\d+)', m_name)
             else:
                 l = [m_name]
             if l[0] == 'w' or l[0] == 'g':
                 pointer = getattr(pointer, 'weight')
             elif l[0] == 'b':
                 pointer = getattr(pointer, 'bias')
+            elif l[0] == 'wpe' or l[0] == 'wte':
+                pointer = getattr(pointer, l[0])
+                pointer = getattr(pointer, 'weight')
             else:
                 pointer = getattr(pointer, l[0])
             if len(l) >= 2:
 ...
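The regex fix matters because GPT-2 checkpoint variables are scoped like model/h0/attn/c_attn/w: the layer index follows the letters directly, with no underscore as in BERT's layer_0 naming that the old pattern was copied from. A minimal sketch of how the new pattern splits such a segment (the variable name is illustrative):

import re

m_name = "h0"  # hypothetical segment of a TF variable name such as "model/h0/attn/c_attn/w"
if re.fullmatch(r'[A-Za-z]+\d+', m_name):
    parts = re.split(r'(\d+)', m_name)  # -> ['h', '0', '']
    print(parts[0], parts[1])           # attribute name 'h' and layer index '0'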
@@ -107,7 +111,7 @@ class GPT2Config(object):
     def __init__(self,
-                 vocab_size_or_config_json_file=40478,
+                 vocab_size_or_config_json_file=50257,
                  n_positions=1024,
                  n_ctx=1024,
                  n_embd=768,
 ...
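The new default is GPT-2's byte-level BPE vocabulary size, replacing the GPT-1 value of 40478: 50,000 learned merges plus 256 base byte tokens plus the <|endoftext|> token. A quick sketch, assuming the config exposes the integer as vocab_size the way the rest of this file reads it:

from pytorch_pretrained_bert.modeling_gpt2 import GPT2Config

config = GPT2Config()             # defaults now match the released GPT-2 model
assert 50000 + 256 + 1 == 50257   # merges + byte tokens + <|endoftext|>
assert config.vocab_size == 50257 # attribute name assumed from this file's config class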
@@ -273,10 +277,10 @@ class Block(nn.Module):
         self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
         self.mlp = MLP(4 * nx, config)
 
-    def forward(self, x, past):
+    def forward(self, x, past=None):
         a, present = self.attn(self.ln_1(x), past=past)
         x = x + a
-        m = self.mlp(self.ln_2(c))
+        m = self.mlp(self.ln_2(x))
         x = x + m
         return x, present
 ...
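Two fixes here: past gains a default so a block can be called without a cache, and the undefined name c (a NameError on every call) is corrected to x. A toy sketch of the pre-norm residual pattern this forward implements, with stand-in modules and illustrative names:

import torch.nn as nn

class ToyBlock(nn.Module):
    def __init__(self, nx):
        super(ToyBlock, self).__init__()
        self.ln_1 = nn.LayerNorm(nx)
        self.ln_2 = nn.LayerNorm(nx)
        self.attn = nn.Linear(nx, nx)  # stand-in for masked self-attention
        self.mlp = nn.Linear(nx, nx)   # stand-in for the feed-forward MLP

    def forward(self, x, past=None):
        x = x + self.attn(self.ln_1(x))  # attention residual (cache omitted in this toy)
        x = x + self.mlp(self.ln_2(x))   # MLP residual reads x, the value the old code called c
        return x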
@@ -522,8 +526,12 @@ class GPT2Model(GPT2PreTrainedModel):
         self.apply(self.init_weights)
 
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None):
-        past_length = 0 if past is None else past[0][0].size(-2)
+    def forward(self, input_ids, position_ids=None, token_type_ids=None, pasts=None):
+        if pasts is None:
+            past_length = 0
+            pasts = [None] * len(self.h)
+        else:
+            past_length = pasts[0][0].size(-2)
         if position_ids is None:
             position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device)
             position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
 ...
@@ -541,8 +549,8 @@ class GPT2Model(GPT2PreTrainedModel):
             token_type_embeds = 0
         hidden_states = inputs_embeds + position_embeds + token_type_embeds
         presents = []
-        for block in self.h:
-            hidden_states, present = block(hidden_states)
+        for block, past in zip(self.h, pasts):
+            hidden_states, present = block(hidden_states, past)
             presents.append(present)
         hidden_states = self.ln_f(hidden_states)
         output_shape = input_shape + (hidden_states.size(-1),)
 ...
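Together these two hunks thread per-layer caches through the stack: pasts is padded to one entry per block, and zip hands each block its own cached keys and values, which is what enables incremental decoding. A hedged usage sketch, assuming the constructors and return values behave as shown in this file (token ids are illustrative):

import torch
from pytorch_pretrained_bert.modeling_gpt2 import GPT2Config, GPT2Model

model = GPT2Model(GPT2Config())   # randomly initialized; enough to show the caching flow
model.eval()

input_ids = torch.tensor([[464, 3290]])                     # a two-token prompt
hidden_states, presents = model(input_ids)                  # presents caches each layer's keys/values
next_ids = torch.tensor([[318]])                            # one newly sampled token
hidden_states, presents = model(next_ids, pasts=presents)   # only the new token is recomputed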
@@ -599,8 +607,8 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         """
         self.lm_head.set_embeddings_weights(self.transformer.wte.weight)
 
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None):
-        hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past)
+    def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, pasts=None):
+        hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, pasts)
         lm_logits = self.lm_head(hidden_states)
         if lm_labels is not None:
             loss_fct = CrossEntropyLoss(ignore_index=-1)
 ...
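The LM loss uses ignore_index=-1, so any position labeled -1 is excluded from the average; callers can mask padding or prompt tokens that way. A minimal, self-contained sketch of that semantics:

import torch
from torch.nn import CrossEntropyLoss

loss_fct = CrossEntropyLoss(ignore_index=-1)
logits = torch.randn(4, 50257)             # four positions over the GPT-2 vocabulary
labels = torch.tensor([464, -1, -1, 318])  # -1 marks positions dropped from the loss
loss = loss_fct(logits, labels)            # mean over the two unmasked positions only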
@@ -665,8 +673,8 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
         """
         self.lm_head.set_embeddings_weights(self.transformer.wte.weight)
 
-    def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None, past=None):
-        hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past)
+    def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None, position_ids=None, pasts=None):
+        hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, pasts)
         lm_logits = self.lm_head(hidden_states)
         mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
         losses = []
 ...
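GPT2DoubleHeadsModel pairs the LM head with a multiple-choice head: mc_token_ids picks, per choice, the position whose hidden state feeds the classifier. A shape sketch under that reading (all values illustrative; the call is commented out because it needs a constructed model):

import torch

batch_size, n_choices, seq_len = 2, 3, 7
input_ids = torch.randint(0, 50257, (batch_size, n_choices, seq_len))
mc_token_ids = torch.full((batch_size, n_choices), seq_len - 1, dtype=torch.long)  # classify at each choice's last token
# lm_logits, mc_logits, presents = model(input_ids, mc_token_ids)
# expected: lm_logits [2, 3, 7, vocab_size], mc_logits [2, 3]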
pytorch_pretrained_bert/modeling_openai.py

@@ -56,7 +56,7 @@ def load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path):
     init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
     init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
-    # Thsi as used when we had a single embedding matrix for positions and tokens
+    # This was used when we had a single embedding matrix for positions and tokens
     # init_params[0] = np.concatenate([init_params[1], init_params[0]], 0)
     # del init_params[1]
     init_params = [arr.squeeze() for arr in init_params]
 ...
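For context, this loader reassembles the sharded NumPy weight files: the shards are concatenated flat, split back at the recorded offsets, and reshaped. A toy sketch of that np.split/reshape pattern (shapes and values are illustrative):

import numpy as np

shapes = [(2, 3), (4,)]                            # hypothetical parameter shapes
offsets = np.cumsum([np.prod(s) for s in shapes])  # split points: [6, 10]
flat = np.arange(10, dtype=np.float32)             # stand-in for the concatenated shards
params = np.split(flat, offsets)[:-1]              # [:-1] drops the empty trailing chunk
params = [p.reshape(s) for p, s in zip(params, shapes)]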
setup.py

@@ -38,7 +38,7 @@ from setuptools import find_packages, setup
 setup(
     name="pytorch_pretrained_bert",
-    version="0.5.1",
+    version="0.6.0",
     author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors",
     author_email="thomas@huggingface.co",
     description="PyTorch version of Google AI BERT model with script to load Google pre-trained models",
 ...
tests/modeling_gpt2_test.py

@@ -38,7 +38,6 @@ class GPT2ModelTest(unittest.TestCase):
                      use_token_type_ids=True,
                      use_labels=True,
                      vocab_size=99,
-                     n_special=1,
                      n_positions=33,
                      n_embd=32,
                      n_layer=5,
 ...
@@ -56,7 +55,6 @@ class GPT2ModelTest(unittest.TestCase):
             self.use_token_type_ids = use_token_type_ids
             self.use_labels = use_labels
             self.vocab_size = vocab_size
-            self.n_special = n_special
             self.n_positions = n_positions
             self.n_embd = n_embd
             self.n_layer = n_layer
 ...
@@ -76,7 +74,7 @@ class GPT2ModelTest(unittest.TestCase):
             token_type_ids = None
             if self.use_token_type_ids:
-                total_voc = self.vocab_size + self.n_special
+                total_voc = self.vocab_size
                 token_type_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)
             mc_labels = None
 ...
@@ -90,7 +88,6 @@ class GPT2ModelTest(unittest.TestCase):
             config = GPT2Config(
                 vocab_size_or_config_json_file=self.vocab_size,
                 n_positions=self.n_positions,
-                n_special=self.n_special,
                 n_embd=self.n_embd,
                 n_layer=self.n_layer,
                 n_head=self.n_head,
 ...
@@ -130,7 +127,7 @@ class GPT2ModelTest(unittest.TestCase):
             return outputs
 
         def check_gpt2_lm_head_output(self, result):
-            total_voc = self.n_special + self.vocab_size
+            total_voc = self.vocab_size
             self.parent.assertListEqual(
                 list(result["lm_logits"].size()),
                 [self.batch_size, self.n_choices, self.seq_length, total_voc])
 ...
@@ -157,7 +154,7 @@ class GPT2ModelTest(unittest.TestCase):
             return outputs
 
         def check_gpt2_double_heads_output(self, result):
-            total_voc = self.n_special + self.vocab_size
+            total_voc = self.vocab_size
             self.parent.assertListEqual(
                 list(result["lm_logits"].size()),
                 [self.batch_size, self.n_choices, self.seq_length, total_voc])
 ...
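All six hunks are the same fix: GPT2Config no longer takes an n_special argument, so the expected logits width is just vocab_size. A one-line check of the new expectation, using this test's defaults:

vocab_size = 99
total_voc = vocab_size   # previously vocab_size + n_special == 100
assert total_voc == 99   # the last dimension of lm_logits the assertions now expect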
tests/tokenization_gpt2_test.py (deleted, 100644 → 0)

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import unittest
import json

from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer


class GPT2TokenizationTest(unittest.TestCase):

    def test_full_tokenizer(self):
        """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                 "w</w>", "r</w>", "t</w>",
                 "lo", "low", "er</w>",
                 "low</w>", "lowest</w>", "newer</w>", "wider</w>"]
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
        with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp:
            json.dump(vocab_tokens, fp)
            vocab_file = fp.name
        with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp:
            fp.write("\n".join(merges))
            merges_file = fp.name
        tokenizer = GPT2Tokenizer(vocab_file, merges_file)
        os.remove(vocab_file)
        os.remove(merges_file)

        text = "lower"
        bpe_tokens = ["low", "er</w>"]
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + ["<unk>"]  # three ids below, so the fixture appended an unknown token
        input_bpe_tokens = [14, 15, 20]
        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)


if __name__ == '__main__':
    unittest.main()
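Worth noting: this fixture encodes word-final "</w>" merges in the GPT-1 style, while GPT2Tokenizer implements GPT-2's byte-level BPE, which marks word starts with "Ġ" and has no "</w>" convention, presumably why the test was deleted outright rather than adapted. A comment-only sketch of the difference (token strings are illustrative):

# GPT-1-style BPE, as in this fixture:  "lower"  -> ["low", "er</w>"]
# GPT-2 byte-level BPE:                 "lower"  -> ["low", "er"]
#                                       " lower" -> ["Ġlower"]  (leading space folded into the token)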