Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
69da972a
"...git@developer.sourcefind.cn:wangsen/paddle_dbnet.git" did not exist on "243f89da21e4791b0cae17f934c196c39d7ee706"
Commit
69da972a
authored
Aug 30, 2019
by
thomwolf
Browse files
added test and debug tokenizer configuration serialization
parent
88111de0
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
32 additions
and
18 deletions
+32
-18
pytorch_transformers/tests/tokenization_bert_test.py
pytorch_transformers/tests/tokenization_bert_test.py
+2
-2
pytorch_transformers/tests/tokenization_gpt2_test.py
pytorch_transformers/tests/tokenization_gpt2_test.py
+3
-2
pytorch_transformers/tests/tokenization_openai_test.py
pytorch_transformers/tests/tokenization_openai_test.py
+2
-2
pytorch_transformers/tests/tokenization_roberta_test.py
pytorch_transformers/tests/tokenization_roberta_test.py
+3
-2
pytorch_transformers/tests/tokenization_tests_commons.py
pytorch_transformers/tests/tokenization_tests_commons.py
+12
-3
pytorch_transformers/tests/tokenization_transfo_xl_test.py
pytorch_transformers/tests/tokenization_transfo_xl_test.py
+3
-2
pytorch_transformers/tests/tokenization_xlm_test.py
pytorch_transformers/tests/tokenization_xlm_test.py
+2
-2
pytorch_transformers/tests/tokenization_xlnet_test.py
pytorch_transformers/tests/tokenization_xlnet_test.py
+2
-2
pytorch_transformers/tokenization_utils.py
pytorch_transformers/tokenization_utils.py
+3
-1
No files found.
pytorch_transformers/tests/tokenization_bert_test.py
View file @
69da972a
...
...
@@ -41,8 +41,8 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
with
open
(
self
.
vocab_file
,
"w"
,
encoding
=
'utf-8'
)
as
vocab_writer
:
vocab_writer
.
write
(
""
.
join
([
x
+
"
\n
"
for
x
in
vocab_tokens
]))
def
get_tokenizer
(
self
):
return
BertTokenizer
.
from_pretrained
(
self
.
tmpdirname
)
def get_tokenizer(self, **kwargs):
    """Instantiate a BertTokenizer from the vocab saved in the temp directory.

    Extra keyword arguments are forwarded to ``from_pretrained`` so tests
    can override tokenizer configuration (e.g. ``max_len``).
    """
    vocab_dir = self.tmpdirname
    return BertTokenizer.from_pretrained(vocab_dir, **kwargs)
def
get_input_output_texts
(
self
):
input_text
=
u
"UNwant
\u00E9
d,running"
...
...
pytorch_transformers/tests/tokenization_gpt2_test.py
View file @
69da972a
...
...
@@ -44,8 +44,9 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
with
open
(
self
.
merges_file
,
"w"
)
as
fp
:
fp
.
write
(
"
\n
"
.
join
(
merges
))
def
get_tokenizer
(
self
):
return
GPT2Tokenizer
.
from_pretrained
(
self
.
tmpdirname
,
**
self
.
special_tokens_map
)
def get_tokenizer(self, **kwargs):
    """Instantiate a GPT2Tokenizer from the temp directory.

    The test fixture's special-token map always takes precedence over any
    caller-supplied keyword arguments (same effect as ``kwargs.update``).
    """
    merged = {**kwargs, **self.special_tokens_map}
    return GPT2Tokenizer.from_pretrained(self.tmpdirname, **merged)
def
get_input_output_texts
(
self
):
input_text
=
u
"lower newer"
...
...
pytorch_transformers/tests/tokenization_openai_test.py
View file @
69da972a
...
...
@@ -45,8 +45,8 @@ class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester):
with
open
(
self
.
merges_file
,
"w"
)
as
fp
:
fp
.
write
(
"
\n
"
.
join
(
merges
))
def
get_tokenizer
(
self
):
return
OpenAIGPTTokenizer
.
from_pretrained
(
self
.
tmpdirname
)
def get_tokenizer(self, **kwargs):
    """Instantiate an OpenAIGPTTokenizer from the saved temp directory.

    Keyword arguments are passed straight through to ``from_pretrained``.
    """
    pretrained_dir = self.tmpdirname
    return OpenAIGPTTokenizer.from_pretrained(pretrained_dir, **kwargs)
def
get_input_output_texts
(
self
):
input_text
=
u
"lower newer"
...
...
pytorch_transformers/tests/tokenization_roberta_test.py
View file @
69da972a
...
...
@@ -43,8 +43,9 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
with
open
(
self
.
merges_file
,
"w"
)
as
fp
:
fp
.
write
(
"
\n
"
.
join
(
merges
))
def
get_tokenizer
(
self
):
return
RobertaTokenizer
.
from_pretrained
(
self
.
tmpdirname
,
**
self
.
special_tokens_map
)
def get_tokenizer(self, **kwargs):
    """Instantiate a RobertaTokenizer from the temp directory.

    The fixture's special-token map overrides any colliding caller kwargs,
    matching the original ``kwargs.update(self.special_tokens_map)`` flow.
    """
    merged = {**kwargs, **self.special_tokens_map}
    return RobertaTokenizer.from_pretrained(self.tmpdirname, **merged)
def
get_input_output_texts
(
self
):
input_text
=
u
"lower newer"
...
...
pytorch_transformers/tests/tokenization_tests_commons.py
View file @
69da972a
...
...
@@ -49,14 +49,19 @@ class CommonTestCases:
def tearDown(self):
    """Delete the temporary working directory created for the test case."""
    tmp = self.tmpdirname
    shutil.rmtree(tmp)
def
get_tokenizer
(
self
):
def get_tokenizer(self, **kwargs):
    """Abstract hook: subclasses must return a tokenizer built from the
    temp directory, forwarding ``kwargs`` to ``from_pretrained``."""
    raise NotImplementedError
def get_input_output_texts(self):
    """Abstract hook: subclasses must return an (input_text, output_text)
    pair exercising their tokenizer."""
    raise NotImplementedError
def
test_save_and_load_tokenizer
(
self
):
# safety check on max_len default value so we are sure the test works
tokenizer
=
self
.
get_tokenizer
()
self
.
assertNotEqual
(
tokenizer
.
max_len
,
42
)
# Now let's start the test
tokenizer
=
self
.
get_tokenizer
(
max_len
=
42
)
before_tokens
=
tokenizer
.
encode
(
u
"He is very happy, UNwant
\u00E9
d,running"
)
...
...
@@ -64,8 +69,12 @@ class CommonTestCases:
tokenizer
.
save_pretrained
(
tmpdirname
)
tokenizer
=
tokenizer
.
from_pretrained
(
tmpdirname
)
after_tokens
=
tokenizer
.
encode
(
u
"He is very happy, UNwant
\u00E9
d,running"
)
self
.
assertListEqual
(
before_tokens
,
after_tokens
)
after_tokens
=
tokenizer
.
encode
(
u
"He is very happy, UNwant
\u00E9
d,running"
)
self
.
assertListEqual
(
before_tokens
,
after_tokens
)
self
.
assertEqual
(
tokenizer
.
max_len
,
42
)
tokenizer
=
tokenizer
.
from_pretrained
(
tmpdirname
,
max_len
=
43
)
self
.
assertEqual
(
tokenizer
.
max_len
,
43
)
def
test_pickle_tokenizer
(
self
):
tokenizer
=
self
.
get_tokenizer
()
...
...
pytorch_transformers/tests/tokenization_transfo_xl_test.py
View file @
69da972a
...
...
@@ -37,8 +37,9 @@ class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester):
with
open
(
self
.
vocab_file
,
"w"
,
encoding
=
'utf-8'
)
as
vocab_writer
:
vocab_writer
.
write
(
""
.
join
([
x
+
"
\n
"
for
x
in
vocab_tokens
]))
def
get_tokenizer
(
self
):
return
TransfoXLTokenizer
.
from_pretrained
(
self
.
tmpdirname
,
lower_case
=
True
)
def get_tokenizer(self, **kwargs):
    """Instantiate a TransfoXLTokenizer from the temp directory.

    Lower-casing is always forced on, regardless of caller kwargs
    (same effect as the original ``kwargs['lower_case'] = True``).
    """
    options = dict(kwargs, lower_case=True)
    return TransfoXLTokenizer.from_pretrained(self.tmpdirname, **options)
def
get_input_output_texts
(
self
):
input_text
=
u
"<unk> UNwanted , running"
...
...
pytorch_transformers/tests/tokenization_xlm_test.py
View file @
69da972a
...
...
@@ -44,8 +44,8 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
with
open
(
self
.
merges_file
,
"w"
)
as
fp
:
fp
.
write
(
"
\n
"
.
join
(
merges
))
def
get_tokenizer
(
self
):
return
XLMTokenizer
.
from_pretrained
(
self
.
tmpdirname
)
def get_tokenizer(self, **kwargs):
    """Instantiate an XLMTokenizer from the saved temp directory,
    forwarding any keyword arguments to ``from_pretrained``."""
    pretrained_dir = self.tmpdirname
    return XLMTokenizer.from_pretrained(pretrained_dir, **kwargs)
def
get_input_output_texts
(
self
):
input_text
=
u
"lower newer"
...
...
pytorch_transformers/tests/tokenization_xlnet_test.py
View file @
69da972a
...
...
@@ -35,8 +35,8 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
tokenizer
=
XLNetTokenizer
(
SAMPLE_VOCAB
,
keep_accents
=
True
)
tokenizer
.
save_pretrained
(
self
.
tmpdirname
)
def
get_tokenizer
(
self
):
return
XLNetTokenizer
.
from_pretrained
(
self
.
tmpdirname
)
def get_tokenizer(self, **kwargs):
    """Instantiate an XLNetTokenizer from the saved temp directory,
    forwarding any keyword arguments to ``from_pretrained``."""
    pretrained_dir = self.tmpdirname
    return XLNetTokenizer.from_pretrained(pretrained_dir, **kwargs)
def
get_input_output_texts
(
self
):
input_text
=
u
"This is a test"
...
...
pytorch_transformers/tokenization_utils.py
View file @
69da972a
...
...
@@ -332,7 +332,7 @@ class PreTrainedTokenizer(object):
tokenizer_config_file
=
resolved_vocab_files
.
pop
(
'tokenizer_config_file'
,
None
)
if
tokenizer_config_file
is
not
None
:
init_kwargs
=
json
.
load
(
open
(
tokenizer_config_file
,
encoding
=
"utf-8"
))
saved_init_inputs
=
init_kwargs
.
pop
(
'init_inputs'
,
[]
)
saved_init_inputs
=
init_kwargs
.
pop
(
'init_inputs'
,
()
)
if
not
init_inputs
:
init_inputs
=
saved_init_inputs
else
:
...
...
@@ -399,6 +399,8 @@ class PreTrainedTokenizer(object):
tokenizer_config
=
copy
.
deepcopy
(
self
.
init_kwargs
)
tokenizer_config
[
'init_inputs'
]
=
copy
.
deepcopy
(
self
.
init_inputs
)
for
file_id
in
self
.
vocab_files_names
.
keys
():
tokenizer_config
.
pop
(
file_id
,
None
)
with
open
(
tokenizer_config_file
,
'w'
,
encoding
=
'utf-8'
)
as
f
:
f
.
write
(
json
.
dumps
(
tokenizer_config
,
ensure_ascii
=
False
))
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment