chenpangpang / transformers / Commits / b1978698

Commit b1978698, authored Jul 09, 2019 by thomwolf

    unified tokenizer api and serialization + tests

Parent: 3d5f2913
Showing 14 changed files with 712 additions and 673 deletions (+712 / -673).
Files changed:

    pytorch_transformers/tests/tokenization_gpt2_test.py        +25   -24
    pytorch_transformers/tests/tokenization_openai_test.py      +25   -26
    pytorch_transformers/tests/tokenization_tests_commons.py    +54   -9
    pytorch_transformers/tests/tokenization_transfo_xl_test.py  +14   -14
    pytorch_transformers/tests/tokenization_utils_test.py       +10   -0
    pytorch_transformers/tests/tokenization_xlm_test.py         +25   -26
    pytorch_transformers/tests/tokenization_xlnet_test.py       +29   -27
    pytorch_transformers/tokenization_bert.py                   +47   -98
    pytorch_transformers/tokenization_gpt2.py                   +23   -90
    pytorch_transformers/tokenization_openai.py                 +23   -102
    pytorch_transformers/tokenization_transfo_xl.py             +22   -29
    pytorch_transformers/tokenization_utils.py                  +360  -23
    pytorch_transformers/tokenization_xlm.py                    +29   -103
    pytorch_transformers/tokenization_xlnet.py                  +26   -102
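The diffs below replace each tokenizer's ad-hoc special_tokens machinery with a shared API on PreTrainedTokenizer: named special-token attributes (unk_token, eos_token, ...), add_tokens() and add_special_tokens() for vocabulary extension, and save_pretrained()/from_pretrained() for a full serialization round trip. A minimal usage sketch of that workflow, mirroring the new tests; the 'bert-base-uncased' shortcut and the example tokens are illustrative, not part of the commit:

    import tempfile

    from pytorch_transformers.tokenization_bert import BertTokenizer

    # Illustrative shortcut name; any tokenizer derived from PreTrainedTokenizer works the same way.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Plain vocabulary extension: returns how many tokens were actually added.
    num_added = tokenizer.add_tokens(['new_tok1', 'new_tok2'])

    # Named special tokens: added to the vocabulary *and* bound to the class attribute.
    tokenizer.add_special_tokens({'eos_token': '<EOS>'})
    print(tokenizer.eos_token, tokenizer.all_special_tokens)

    # len() counts the base vocabulary plus added tokens; vocab_size stays fixed.
    print(tokenizer.vocab_size, len(tokenizer))

    # Round trip: save_pretrained() writes the vocabulary files plus
    # special_tokens_map.json and added_tokens.json; from_pretrained() restores them.
    before = tokenizer.encode("He is very happy, UNwanted,running")
    with tempfile.TemporaryDirectory() as tmpdir:
        tokenizer.save_pretrained(tmpdir)
        reloaded = BertTokenizer.from_pretrained(tmpdir)
        assert reloaded.encode("He is very happy, UNwanted,running") == before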
pytorch_transformers/tests/tokenization_gpt2_test.py (view file @ b1978698)

@@ -17,8 +17,9 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import unittest
 import json
+import tempfile

-from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer
+from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES

 from .tokenization_tests_commons import create_and_check_tokenizer_commons
@@ -28,32 +29,32 @@ class GPT2TokenizationTest(unittest.TestCase):
         """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                  "lo", "low", "er",
-                 "low", "lowest", "newer", "wider"]
+                 "low", "lowest", "newer", "wider", "<unk>"]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "l o", "lo w", "e r", ""]
-        with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp:
-            fp.write(json.dumps(vocab_tokens))
-            vocab_file = fp.name
-        with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp:
-            fp.write("\n".join(merges))
-            merges_file = fp.name
+        special_tokens_map = {"unk_token": "<unk>"}
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
+            merges_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['merges_file'])
+            with open(vocab_file, "w") as fp:
+                fp.write(json.dumps(vocab_tokens))
+            with open(merges_file, "w") as fp:
+                fp.write("\n".join(merges))

-        create_and_check_tokenizer_commons(self, GPT2Tokenizer, vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
+            create_and_check_tokenizer_commons(self, GPT2Tokenizer, tmpdirname, **special_tokens_map)

-        tokenizer = GPT2Tokenizer(vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
+            tokenizer = GPT2Tokenizer(vocab_file, merges_file, **special_tokens_map)

-        text = "lower"
-        bpe_tokens = ["low", "er"]
-        tokens = tokenizer.tokenize(text)
-        self.assertListEqual(tokens, bpe_tokens)
+            text = "lower"
+            bpe_tokens = ["low", "er"]
+            tokens = tokenizer.tokenize(text)
+            self.assertListEqual(tokens, bpe_tokens)

-        input_tokens = tokens + ["<unk>"]
-        input_bpe_tokens = [13, 12, 16]
-        self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
-
-        os.remove(vocab_file)
-        os.remove(merges_file)
+            input_tokens = tokens + [tokenizer.unk_token]
+            input_bpe_tokens = [13, 12, 17]
+            self.assertListEqual(
+                tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

 if __name__ == '__main__':
     unittest.main()
pytorch_transformers/tests/tokenization_openai_test.py (view file @ b1978698)

@@ -17,10 +17,9 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import unittest
 import json
-import shutil
-import pytest
+import tempfile

-from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer
+from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer, VOCAB_FILES_NAMES

 from .tokenization_tests_commons import create_and_check_tokenizer_commons
@@ -32,21 +31,21 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                  "w</w>", "r</w>", "t</w>", "lo", "low", "er</w>",
-                 "low</w>", "lowest</w>", "newer</w>", "wider</w>"]
+                 "low</w>", "lowest</w>", "newer</w>", "wider</w>", "<unk>"]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
-        with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp:
-            fp.write(json.dumps(vocab_tokens))
-            vocab_file = fp.name
-        with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp:
-            fp.write("\n".join(merges))
-            merges_file = fp.name
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
+            merges_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['merges_file'])
+            with open(vocab_file, "w") as fp:
+                fp.write(json.dumps(vocab_tokens))
+            with open(merges_file, "w") as fp:
+                fp.write("\n".join(merges))

-        create_and_check_tokenizer_commons(self, OpenAIGPTTokenizer, vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
+            create_and_check_tokenizer_commons(self, OpenAIGPTTokenizer, tmpdirname)

-        tokenizer = OpenAIGPTTokenizer(vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
-        os.remove(vocab_file)
-        os.remove(merges_file)
+            tokenizer = OpenAIGPTTokenizer(vocab_file, merges_file)

             text = "lower"
             bpe_tokens = ["low", "er</w>"]
pytorch_transformers/tests/tokenization_tests_commons.py (view file @ b1978698)

@@ -17,6 +17,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import sys
 from io import open
+import tempfile

 if sys.version_info[0] == 3:
     unicode = str
@@ -28,22 +29,19 @@ else:
 def create_and_check_save_and_load_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
-    tokenizer = tokenizer_class(*inputs, **kwargs)
+    tokenizer = tokenizer_class.from_pretrained(*inputs, **kwargs)

     before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")

-    vocab_path = "/tmp/"
-    output_files = tokenizer.save_vocabulary(vocab_path=vocab_path)
-    tokenizer = tokenizer.from_pretrained(vocab_path)
-    for f in output_files:
-        os.remove(f)
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        tokenizer.save_pretrained(tmpdirname)
+        tokenizer = tokenizer.from_pretrained(tmpdirname)

     after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
     tester.assertListEqual(before_tokens, after_tokens)

 def create_and_check_pickle_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
-    tokenizer = tokenizer_class(*inputs, **kwargs)
+    tokenizer = tokenizer_class.from_pretrained(*inputs, **kwargs)

     text = u"Munich and Berlin are nice cities"
     filename = u"/tmp/tokenizer.bin"
@@ -58,8 +56,54 @@ def create_and_check_pickle_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
     tester.assertListEqual(subwords, subwords_loaded)

+def create_and_check_add_tokens_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
+    tokenizer = tokenizer_class.from_pretrained(*inputs, **kwargs)
+
+    vocab_size = tokenizer.vocab_size
+    all_size = len(tokenizer)
+
+    tester.assertNotEqual(vocab_size, 0)
+    tester.assertEqual(vocab_size, all_size)
+
+    new_toks = ["aaaaabbbbbb", "cccccccccdddddddd"]
+    added_toks = tokenizer.add_tokens(new_toks)
+    vocab_size_2 = tokenizer.vocab_size
+    all_size_2 = len(tokenizer)
+
+    tester.assertNotEqual(vocab_size_2, 0)
+    tester.assertEqual(vocab_size, vocab_size_2)
+    tester.assertEqual(added_toks, len(new_toks))
+    tester.assertEqual(all_size_2, all_size + len(new_toks))
+
+    tokens = tokenizer.encode("aaaaabbbbbb low cccccccccdddddddd l")
+    tester.assertGreaterEqual(len(tokens), 4)
+    tester.assertGreater(tokens[0], tokenizer.vocab_size - 1)
+    tester.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
+
+    new_toks_2 = {'eos_token': ">>>>|||<||<<|<<",
+                  'pad_token': "<<<<<|||>|>>>>|>"}
+    added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
+    vocab_size_3 = tokenizer.vocab_size
+    all_size_3 = len(tokenizer)
+
+    tester.assertNotEqual(vocab_size_3, 0)
+    tester.assertEqual(vocab_size, vocab_size_3)
+    tester.assertEqual(added_toks_2, len(new_toks_2))
+    tester.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
+
+    tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l")
+    tester.assertGreaterEqual(len(tokens), 6)
+    tester.assertGreater(tokens[0], tokenizer.vocab_size - 1)
+    tester.assertGreater(tokens[0], tokens[1])
+    tester.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
+    tester.assertGreater(tokens[-2], tokens[-3])
+    tester.assertEqual(tokens[0], tokenizer.convert_tokens_to_ids(tokenizer.eos_token))
+    tester.assertEqual(tokens[-2], tokenizer.convert_tokens_to_ids(tokenizer.pad_token))
+
 def create_and_check_required_methods_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
-    tokenizer = tokenizer_class(*inputs, **kwargs)
+    tokenizer = tokenizer_class.from_pretrained(*inputs, **kwargs)

     text = u"He is very happy, UNwant\u00E9d,running"
     tokens = tokenizer.tokenize(text)
@@ -75,5 +119,6 @@ def create_and_check_required_methods_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
 def create_and_check_tokenizer_commons(tester, tokenizer_class, *inputs, **kwargs):
     create_and_check_required_methods_tokenizer(tester, tokenizer_class, *inputs, **kwargs)
+    create_and_check_add_tokens_tokenizer(tester, tokenizer_class, *inputs, **kwargs)
     create_and_check_save_and_load_tokenizer(tester, tokenizer_class, *inputs, **kwargs)
     create_and_check_pickle_tokenizer(tester, tokenizer_class, *inputs, **kwargs)
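Besides the save/load round trip, the commons helpers also verify that a tokenizer survives pickling. The same check written out as a stand-alone sketch; the 'bert-base-uncased' shortcut is an illustrative choice, while the text and file name come from the test above:

    import os
    import pickle
    import tempfile

    from pytorch_transformers.tokenization_bert import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  # illustrative shortcut
    text = u"Munich and Berlin are nice cities"
    subwords = tokenizer.tokenize(text)

    with tempfile.TemporaryDirectory() as tmpdir:
        filename = os.path.join(tmpdir, u"tokenizer.bin")
        with open(filename, "wb") as handle:
            pickle.dump(tokenizer, handle)
        with open(filename, "rb") as handle:
            tokenizer_loaded = pickle.load(handle)

    # The reloaded tokenizer should produce the same subwords as the original.
    assert tokenizer_loaded.tokenize(text) == subwords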
pytorch_transformers/tests/tokenization_transfo_xl_test.py (view file @ b1978698)

@@ -17,10 +17,9 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import unittest
 from io import open
-import shutil
-import pytest
+import tempfile

-from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer
+from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES

 from .tokenization_tests_commons import create_and_check_tokenizer_commons
@@ -28,16 +27,17 @@ class TransfoXLTokenizationTest(unittest.TestCase):

     def test_full_tokenizer(self):
         vocab_tokens = [
-            "<unk>", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un", "running", ","
+            "<unk>", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un", "running", ",",
+            "low", "l",
         ]
-        with open("/tmp/transfo_xl_tokenizer_test.txt", "w", encoding='utf-8') as vocab_writer:
-            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-            vocab_file = vocab_writer.name
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
+            with open(vocab_file, "w", encoding='utf-8') as vocab_writer:
+                vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

-        create_and_check_tokenizer_commons(self, TransfoXLTokenizer, vocab_file=vocab_file, lower_case=True)
+            create_and_check_tokenizer_commons(self, TransfoXLTokenizer, tmpdirname, lower_case=True)

-        tokenizer = TransfoXLTokenizer(vocab_file=vocab_file, lower_case=True)
-        os.remove(vocab_file)
+            tokenizer = TransfoXLTokenizer(vocab_file=vocab_file, lower_case=True)

-        tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
-        self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
+            tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
+            self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
pytorch_transformers/tests/tokenization_utils_test.py (view file @ b1978698)

@@ -17,6 +17,7 @@ from __future__ import division
 from __future__ import print_function

 import unittest
+import six

 from pytorch_transformers import PreTrainedTokenizer
 from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer
@@ -27,8 +28,17 @@ class TokenizerUtilsTest(unittest.TestCase):
         for model_name in s3_models[:1]:
             tokenizer = tokenizer_class.from_pretrained(model_name)
             self.assertIsNotNone(tokenizer)
             self.assertIsInstance(tokenizer, tokenizer_class)
             self.assertIsInstance(tokenizer, PreTrainedTokenizer)

+            for special_tok in tokenizer.all_special_tokens:
+                if six.PY2:
+                    self.assertIsInstance(special_tok, unicode)
+                else:
+                    self.assertIsInstance(special_tok, str)
+                special_tok_id = tokenizer.convert_tokens_to_ids(special_tok)
+                self.assertIsInstance(special_tok_id, int)
+
     def test_pretrained_tokenizers(self):
         self.check_tokenizer_from_pretrained(GPT2Tokenizer)
pytorch_transformers/tests/tokenization_xlm_test.py (view file @ b1978698)

@@ -17,10 +17,9 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import unittest
 import json
-import shutil
-import pytest
+import tempfile

-from pytorch_transformers.tokenization_xlm import XLMTokenizer
+from pytorch_transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES

 from .tokenization_tests_commons import create_and_check_tokenizer_commons
@@ -31,21 +30,21 @@ class XLMTokenizationTest(unittest.TestCase):
         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                  "w</w>", "r</w>", "t</w>", "lo", "low", "er</w>",
-                 "low</w>", "lowest</w>", "newer</w>", "wider</w>"]
+                 "low</w>", "lowest</w>", "newer</w>", "wider</w>", "<unk>"]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]

-        with open("/tmp/openai_tokenizer_vocab_test.json", "w") as fp:
-            fp.write(json.dumps(vocab_tokens))
-            vocab_file = fp.name
-        with open("/tmp/openai_tokenizer_merges_test.txt", "w") as fp:
-            fp.write("\n".join(merges))
-            merges_file = fp.name
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
+            merges_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['merges_file'])
+            with open(vocab_file, "w") as fp:
+                fp.write(json.dumps(vocab_tokens))
+            with open(merges_file, "w") as fp:
+                fp.write("\n".join(merges))

-        create_and_check_tokenizer_commons(self, XLMTokenizer, vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
+            create_and_check_tokenizer_commons(self, XLMTokenizer, tmpdirname)

-        tokenizer = XLMTokenizer(vocab_file, merges_file, special_tokens=["<unk>", "<pad>"])
-        os.remove(vocab_file)
-        os.remove(merges_file)
+            tokenizer = XLMTokenizer(vocab_file, merges_file)

             text = "lower"
             bpe_tokens = ["low", "er</w>"]
pytorch_transformers/tests/tokenization_xlnet_test.py (view file @ b1978698)

@@ -16,10 +16,9 @@ from __future__ import absolute_import, division, print_function, unicode_literals
 import os
 import unittest
-import shutil
-import pytest
+import tempfile

-from pytorch_transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE)
+from pytorch_transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE, VOCAB_FILES_NAMES)

 from .tokenization_tests_commons import create_and_check_tokenizer_commons
@@ -29,10 +28,13 @@ SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)),
 class XLNetTokenizationTest(unittest.TestCase):

     def test_full_tokenizer(self):
-        create_and_check_tokenizer_commons(self, XLNetTokenizer, SAMPLE_VOCAB)
-
         tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)

+        with tempfile.TemporaryDirectory() as tmpdirname:
+            tokenizer.save_pretrained(tmpdirname)
+            create_and_check_tokenizer_commons(self, XLNetTokenizer, tmpdirname)
+
         tokens = tokenizer.tokenize(u'This is a test')
         self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est'])
pytorch_transformers/tokenization_bert.py (view file @ b1978698)

@@ -22,7 +22,6 @@ import os
 import unicodedata
 from io import open

-from .file_utils import cached_path
 from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization

 logger = logging.getLogger(__name__)
@@ -45,7 +44,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
         'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt",
         'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt",
         'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt",
-    }}
+    }
+}

 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'bert-base-uncased': 512,
@@ -93,8 +93,9 @@ class BertTokenizer(PreTrainedTokenizer):
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

-    def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True,
-                 never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
+    def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None,
+                 unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]",
+                 mask_token="[MASK]", **kwargs):
         """Constructs a BertTokenizer.

         Args:
@@ -102,17 +103,18 @@ class BertTokenizer(PreTrainedTokenizer):
             do_lower_case: Whether to lower case the input
                            Only has an effect when do_wordpiece_only=False
             do_basic_tokenize: Whether to do basic tokenization before wordpiece.
-            max_len: An artificial maximum length to truncate tokenized sequences to;
-                     Effective maximum length is always the minimum of this
-                     value (if specified) and the underlying BERT model's
-                     sequence length.
             never_split: List of tokens which will never be split during tokenization.
                          Only has an effect when do_wordpiece_only=False
         """
+        super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
+                                            pad_token=pad_token, cls_token=cls_token,
+                                            mask_token=mask_token, **kwargs)
         if not os.path.isfile(vocab_file):
             raise ValueError(
                 "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                 "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
+        if never_split is None:
+            never_split = self.all_special_tokens
         self.vocab = load_vocab(vocab_file)
         self.ids_to_tokens = collections.OrderedDict(
             [(ids, tok) for tok, ids in self.vocab.items()])
@@ -120,90 +122,34 @@ class BertTokenizer(PreTrainedTokenizer):
         if do_basic_tokenize:
             self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
                                                   never_split=never_split)
-        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
-        self.max_len = max_len if max_len is not None else int(1e12)
-
-    @property
-    def UNK_TOKEN(self):
-        return "[UNK]"
-
-    @property
-    def SEP_TOKEN(self):
-        return "[SEP]"
-
-    @property
-    def PAD_TOKEN(self):
-        return "[PAD]"
-
-    @property
-    def CLS_TOKEN(self):
-        return "[CLS]"
-
-    @property
-    def MASK_TOKEN(self):
-        return "[MASK]"
+        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)

     @property
-    def UNK_ID(self):
-        return self.vocab["[UNK]"]
-
-    @property
-    def SEP_ID(self):
-        return self.vocab["[SEP]"]
-
-    @property
-    def PAD_ID(self):
-        return self.vocab["[PAD]"]
-
-    @property
-    def CLS_ID(self):
-        return self.vocab["[CLS]"]
-
-    @property
-    def MASK_ID(self):
-        return self.vocab["[MASK]"]
+    def vocab_size(self):
+        return len(self.vocab)

-    def tokenize(self, text):
+    def _tokenize(self, text):
         split_tokens = []
         if self.do_basic_tokenize:
-            for token in self.basic_tokenizer.tokenize(text):
+            for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
                 for sub_token in self.wordpiece_tokenizer.tokenize(token):
                     split_tokens.append(sub_token)
         else:
             split_tokens = self.wordpiece_tokenizer.tokenize(text)
         return split_tokens

-    def convert_tokens_to_ids(self, tokens):
-        """Converts a sequence of tokens into ids using the vocab."""
-        ids = []
-        for token in tokens:
-            ids.append(self.vocab[token])
-        if len(ids) > self.max_len:
-            logger.warning("Token indices sequence length is longer than the specified maximum "
-                           " sequence length for this BERT model ({} > {}). Running this"
-                           " sequence through BERT will result in indexing errors".format(len(ids), self.max_len))
-        return ids
-
-    def convert_ids_to_tokens(self, ids):
-        """Converts a sequence of ids in wordpiece tokens using the vocab."""
-        tokens = []
-        for i in ids:
-            tokens.append(self.ids_to_tokens[i])
-        return tokens
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) in an id using the vocab. """
+        return self.vocab.get(token, self.vocab.get(self.unk_token))

-    def encode(self, text):
-        return self.convert_tokens_to_ids(self.tokenize(text))
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        return self.ids_to_tokens.get(index, self.unk_token)

-    def decode(self, token_ids, clean_up_tokenization_spaces=True):
+    def _convert_ids_to_string(self, tokens_ids):
         """Converts a sequence of ids in a string."""
-        tokens = self.convert_ids_to_tokens(token_ids)
+        tokens = self.convert_ids_to_tokens(tokens_ids)
         out_string = ' '.join(tokens).replace(' ##', '').strip()
-        if clean_up_tokenization_spaces:
-            for special_tok in (self.UNK_TOKEN, self.SEP_TOKEN, self.PAD_TOKEN, self.CLS_TOKEN, self.MASK_TOKEN):
-                out_string = out_string.replace(special_tok, '')
-            out_string = clean_up_tokenization(out_string)
         return out_string

     def save_vocabulary(self, vocab_path):
@@ -245,17 +191,20 @@ class BasicTokenizer(object):

-    def __init__(self, do_lower_case=True,
-                 never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
+    def __init__(self, do_lower_case=True, never_split=None):
         """Constructs a BasicTokenizer.

         Args:
             do_lower_case: Whether to lower case the input.
         """
+        if never_split is None:
+            never_split = []
         self.do_lower_case = do_lower_case
         self.never_split = never_split

-    def tokenize(self, text):
+    def tokenize(self, text, never_split=None):
         """Tokenizes a piece of text."""
+        never_split = self.never_split + (never_split if never_split is not None else [])
         text = self._clean_text(text)
         # This was added on November 1st, 2018 for the multilingual and Chinese
         # models. This is also applied to the English models now, but it doesn't
@@ -267,7 +216,7 @@ class BasicTokenizer(object):
         orig_tokens = whitespace_tokenize(text)
         split_tokens = []
         for token in orig_tokens:
-            if self.do_lower_case and token not in self.never_split:
+            if self.do_lower_case and token not in never_split:
                 token = token.lower()
                 token = self._run_strip_accents(token)
             split_tokens.extend(self._run_split_on_punc(token))
@@ -286,9 +235,9 @@ class BasicTokenizer(object):
             output.append(char)
         return "".join(output)

-    def _run_split_on_punc(self, text):
+    def _run_split_on_punc(self, text, never_split=None):
         """Splits punctuation on a piece of text."""
-        if text in self.never_split:
+        if never_split is not None and text in never_split:
             return [text]
         chars = list(text)
         i = 0
@@ -360,7 +309,7 @@ class BasicTokenizer(object):
 class WordpieceTokenizer(object):
     """Runs WordPiece tokenization."""

-    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
+    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
         self.vocab = vocab
         self.unk_token = unk_token
         self.max_input_chars_per_word = max_input_chars_per_word
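For BERT the special tokens become named constructor arguments forwarded to PreTrainedTokenizer, never_split defaults to the tokenizer's own special tokens, and the model-specific code shrinks to the _tokenize()/_convert_token_to_id()/_convert_id_to_token() hooks. A minimal sketch of constructing the tokenizer directly from a vocabulary file; 'vocab.txt' is a stand-in path, not part of the commit:

    from pytorch_transformers.tokenization_bert import BertTokenizer

    # 'vocab.txt' stands in for a real WordPiece vocabulary file.
    tokenizer = BertTokenizer('vocab.txt', do_lower_case=True,
                              unk_token='[UNK]', sep_token='[SEP]', pad_token='[PAD]',
                              cls_token='[CLS]', mask_token='[MASK]')

    # tokenize() and convert_tokens_to_ids() now come from PreTrainedTokenizer and are
    # built on top of the _tokenize()/_convert_token_to_id() hooks defined in this file.
    tokens = tokenizer.tokenize(u"He is very happy, UNwant\u00E9d,running")
    ids = tokenizer.convert_tokens_to_ids(tokens)   # same result as tokenizer.encode(...)
    print(tokens, ids)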
pytorch_transformers/tokenization_gpt2.py (view file @ b1978698)

@@ -38,7 +38,6 @@ logger = logging.getLogger(__name__)
 VOCAB_FILES_NAMES = {
     'vocab_file': 'vocab.json',
     'merges_file': 'merges.txt',
-    'special_tokens_file': 'special_tokens.txt',
 }

 PRETRAINED_VOCAB_FILES_MAP = {
@@ -52,11 +51,6 @@ PRETRAINED_VOCAB_FILES_MAP = {
         'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
         'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
     },
-    'special_tokens_file':
-    {
-        'gpt2': None,
-        'gpt2-medium': None,
-    }
 }

 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
@@ -108,8 +102,10 @@ class GPT2Tokenizer(PreTrainedTokenizer):
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

-    def __init__(self, vocab_file, merges_file, special_tokens_file=None, special_tokens=None,
-                 errors='replace', max_len=None):
-        self.max_len = max_len if max_len is not None else int(1e12)
+    def __init__(self, vocab_file, merges_file, errors='replace',
+                 bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs):
+        super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, **kwargs)
+
         self.encoder = json.load(open(vocab_file))
         self.decoder = {v: k for k, v in self.encoder.items()}
         self.errors = errors  # how to handle errors in decoding
@@ -123,32 +119,9 @@ class GPT2Tokenizer(PreTrainedTokenizer):
         # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
         self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

-        all_special_tokens = []
-        if special_tokens_file is not None:
-            special_tokens_to_add = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
-            all_special_tokens.extend(special_tokens_to_add)
-        if special_tokens is not None and special_tokens:
-            all_special_tokens.extend(special_tokens)
-
-        self.special_tokens = {}
-        self.special_tokens_decoder = {}
-        self.set_special_tokens(all_special_tokens)
-
-    def __len__(self):
-        return len(self.encoder) + len(self.special_tokens)
-
-    def set_special_tokens(self, special_tokens):
-        """ Add a list of additional tokens to the encoder.
-            The additional tokens are indexed starting from the last index of the
-            current vocabulary in the order of the `special_tokens` list.
-        """
-        if not special_tokens:
-            self.special_tokens = {}
-            self.special_tokens_decoder = {}
-            return
-        self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
-        self.special_tokens_decoder = {v: k for k, v in self.special_tokens.items()}
-        logger.info("Special tokens {}".format(self.special_tokens))
+    @property
+    def vocab_size(self):
+        return len(self.encoder)

     def bpe(self, token):
         if token in self.cache:
@@ -191,7 +164,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
         self.cache[token] = word
         return word

-    def tokenize(self, text):
+    def _tokenize(self, text):
         """ Tokenize a string. """
         bpe_tokens = []
         for token in re.findall(self.pat, text):
@@ -202,57 +175,27 @@ class GPT2Tokenizer(PreTrainedTokenizer):
             bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
         return bpe_tokens

-    def convert_tokens_to_ids(self, tokens):
-        """ Converts a sequence of tokens into ids using the vocab. """
-        ids = []
-        if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
-            if tokens in self.special_tokens:
-                return self.special_tokens[tokens]
-            else:
-                return self.encoder.get(tokens, 0)
-        for token in tokens:
-            if token in self.special_tokens:
-                ids.append(self.special_tokens[token])
-            else:
-                ids.append(self.encoder.get(token, 0))
-        if len(ids) > self.max_len:
-            logger.warning("Token indices sequence length is longer than the specified maximum "
-                           " sequence length for this OpenAI GPT model ({} > {}). Running this"
-                           " sequence through the model will result in indexing errors".format(len(ids), self.max_len))
-        return ids
-
-    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
-        """Converts a sequence of ids in BPE tokens using the vocab."""
-        tokens = []
-        for i in ids:
-            if i in self.special_tokens_decoder:
-                if not skip_special_tokens:
-                    tokens.append(self.special_tokens_decoder[i])
-            else:
-                tokens.append(self.decoder[i])
-        return tokens
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) in an id using the vocab. """
+        return self.encoder.get(token, self.encoder.get(self.unk_token))

-    def encode(self, text):
-        return self.convert_tokens_to_ids(self.tokenize(text))
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        return self.decoder.get(index, self.unk_token)

-    def decode(self, tokens, skip_special_tokens=False, clean_up_tokenization_spaces=True):
-        text = ''.join(self.convert_ids_to_tokens(tokens, skip_special_tokens=skip_special_tokens))
+    def _convert_ids_to_string(self, tokens_ids):
+        """Converts a sequence of ids in a string."""
+        text = ''.join(tokens_ids)
         text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
-        if clean_up_tokenization_spaces:
-            text = text.replace('<unk>', '')
-            text = clean_up_tokenization(text)
         return text

-    def save_vocabulary(self, vocab_path):
+    def save_vocabulary(self, save_directory):
         """Save the tokenizer vocabulary and merge files to a directory."""
-        if not os.path.isdir(vocab_path):
-            logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
             return
-        vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
-        merge_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['merges_file'])
-        special_tokens_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['special_tokens_file'])
+        vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
+        merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])

         with open(vocab_file, 'w', encoding='utf-8') as f:
             f.write(json.dumps(self.encoder, ensure_ascii=False))
@@ -268,14 +211,4 @@ class GPT2Tokenizer(PreTrainedTokenizer):
                 writer.write(' '.join(bpe_tokens) + u'\n')
                 index += 1

-        index = len(self.encoder)
-        with open(special_tokens_file, 'w', encoding='utf-8') as writer:
-            for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive."
-                                   " Please check that the tokenizer is not corrupted!".format(special_tokens_file))
-                    index = token_index
-                writer.write(token + u'\n')
-                index += 1
-
-        return vocab_file, merge_file, special_tokens_file
+        return vocab_file, merge_file
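GPT-2 drops the special-tokens file entirely: bos/eos default to <|endoftext|>, save_vocabulary() now takes a directory and returns just the vocab and merges paths, and save_pretrained() layers the special-tokens and added-tokens JSON on top. A short sketch; 'vocab.json' and 'merges.txt' are stand-in paths, not files shipped with the commit:

    import tempfile

    from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer

    # Stand-in paths to a GPT-2 byte-level BPE vocabulary and merges file.
    tokenizer = GPT2Tokenizer('vocab.json', 'merges.txt',
                              bos_token='<|endoftext|>', eos_token='<|endoftext|>')

    with tempfile.TemporaryDirectory() as tmpdir:
        # save_vocabulary() writes only vocab.json and merges.txt ...
        vocab_file, merges_file = tokenizer.save_vocabulary(tmpdir)
        # ... while save_pretrained() additionally writes special_tokens_map.json
        # and added_tokens.json so from_pretrained() can restore the full state.
        tokenizer.save_pretrained(tmpdir)
        reloaded = GPT2Tokenizer.from_pretrained(tmpdir)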
pytorch_transformers/tokenization_openai.py (view file @ b1978698)

@@ -20,13 +20,9 @@ import json
 import logging
 import os
 import re
-import sys
 from io import open

-from tqdm import tqdm
-
-from .file_utils import cached_path
-from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
+from .tokenization_utils import PreTrainedTokenizer
 from .tokenization_bert import BasicTokenizer

 logger = logging.getLogger(__name__)
@@ -34,7 +30,6 @@ logger = logging.getLogger(__name__)
 VOCAB_FILES_NAMES = {
     'vocab_file': 'vocab.json',
     'merges_file': 'merges.txt',
-    'special_tokens_file': 'special_tokens.txt',
 }

 PRETRAINED_VOCAB_FILES_MAP = {
@@ -46,10 +41,6 @@ PRETRAINED_VOCAB_FILES_MAP = {
     {
         'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt",
     },
-    'special_tokens_file':
-    {
-        'openai-gpt': None,
-    }
 }

 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
@@ -88,14 +79,14 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
     BPE tokenizer. Peculiarities:
         - lower case all inputs
         - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
-        - argument special_tokens and function set_special_tokens:
-            can be used to add additional symbols (ex: "__classify__") to a vocabulary.
     """
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

-    def __init__(self, vocab_file, merges_file, special_tokens_file=None, special_tokens=None, max_len=None):
+    def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
+        super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs)
+
         try:
             import ftfy
             import spacy
@@ -103,11 +94,9 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
             self.fix_text = ftfy.fix_text
         except ImportError:
             logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
-            self.nlp = BasicTokenizer(do_lower_case=True,
-                                      never_split=special_tokens if special_tokens is not None else [])
+            self.nlp = BasicTokenizer(do_lower_case=True)
             self.fix_text = None

-        self.max_len = max_len if max_len is not None else int(1e12)
         self.encoder = json.load(open(vocab_file, encoding="utf-8"))
         self.decoder = {v: k for k, v in self.encoder.items()}
         merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
@@ -115,35 +104,9 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
         self.bpe_ranks = dict(zip(merges, range(len(merges))))
         self.cache = {}

-        all_special_tokens = []
-        if special_tokens_file is not None:
-            special_tokens_to_add = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
-            all_special_tokens.extend(special_tokens_to_add)
-        if special_tokens is not None and special_tokens:
-            all_special_tokens.extend(special_tokens)
-
-        self.special_tokens = {}
-        self.special_tokens_decoder = {}
-        self.set_special_tokens(all_special_tokens)
-
-    def __len__(self):
-        return len(self.encoder) + len(self.special_tokens)
-
-    def set_special_tokens(self, special_tokens):
-        """ Add a list of additional tokens to the encoder.
-            The additional tokens are indexed starting from the last index of the
-            current vocabulary in the order of the `special_tokens` list.
-        """
-        if not special_tokens:
-            self.special_tokens = {}
-            self.special_tokens_decoder = {}
-            return
-        self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
-        self.special_tokens_decoder = {v: k for k, v in self.special_tokens.items()}
-        if self.fix_text is None:
-            # Using BERT's BasicTokenizer: we can update the tokenizer
-            self.nlp.never_split = special_tokens
-        logger.info("Special tokens {}".format(self.special_tokens))
+    @property
+    def vocab_size(self):
+        return len(self.encoder)

     def bpe(self, token):
         word = tuple(token[:-1]) + (token[-1] + '</w>',)
@@ -188,7 +151,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
         self.cache[token] = word
         return word

-    def tokenize(self, text):
+    def _tokenize(self, text):
         """ Tokenize a string. """
         split_tokens = []
         if self.fix_text is None:
@@ -203,58 +166,26 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
                 split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
         return split_tokens

-    def convert_tokens_to_ids(self, tokens):
-        """ Converts a sequence of tokens into ids using the vocab. """
-        ids = []
-        if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
-            if tokens in self.special_tokens:
-                return self.special_tokens[tokens]
-            else:
-                return self.encoder.get(tokens, 0)
-        for token in tokens:
-            if token in self.special_tokens:
-                ids.append(self.special_tokens[token])
-            else:
-                ids.append(self.encoder.get(token, 0))
-        if len(ids) > self.max_len:
-            logger.warning("Token indices sequence length is longer than the specified maximum "
-                           " sequence length for this OpenAI GPT model ({} > {}). Running this"
-                           " sequence through the model will result in indexing errors".format(len(ids), self.max_len))
-        return ids
-
-    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
-        """Converts a sequence of ids in BPE tokens using the vocab."""
-        tokens = []
-        for i in ids:
-            if i in self.special_tokens_decoder:
-                if not skip_special_tokens:
-                    tokens.append(self.special_tokens_decoder[i])
-            else:
-                tokens.append(self.decoder[i])
-        return tokens
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) in an id using the vocab. """
+        return self.encoder.get(token, self.encoder.get(self.unk_token))

-    def encode(self, text):
-        return self.convert_tokens_to_ids(self.tokenize(text))
+    def _convert_id_to_token(self, index):
+        """Converts an id in a token (BPE) using the vocab."""
+        return self.decoder.get(index, self.unk_token)

-    def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
+    def _convert_ids_to_string(self, tokens_ids):
         """Converts a sequence of ids in a string."""
-        tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)
-        out_string = ''.join(tokens).replace('</w>', ' ').strip()
-        if clean_up_tokenization_spaces:
-            out_string = out_string.replace('<unk>', '')
-            out_string = clean_up_tokenization(out_string)
+        out_string = ''.join(tokens_ids).replace('</w>', ' ').strip()
         return out_string

-    def save_vocabulary(self, vocab_path):
+    def save_vocabulary(self, save_directory):
         """Save the tokenizer vocabulary and merge files to a directory."""
-        if not os.path.isdir(vocab_path):
-            logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
             return
-        vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
-        merge_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['merges_file'])
-        special_tokens_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['special_tokens_file'])
+        vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
+        merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])

         with open(vocab_file, 'w', encoding='utf-8') as f:
             f.write(json.dumps(self.encoder, ensure_ascii=False))
@@ -270,14 +201,4 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
                 writer.write(' '.join(bpe_tokens) + u'\n')
                 index += 1

-        index = len(self.encoder)
-        with open(special_tokens_file, 'w', encoding='utf-8') as writer:
-            for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive."
-                                   " Please check that the tokenizer is not corrupted!".format(special_tokens_file))
-                    index = token_index
-                writer.write(token + u'\n')
-                index += 1
-
-        return vocab_file, merge_file, special_tokens_file
+        return vocab_file, merge_file
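The OpenAI GPT tokenizer follows the same pattern: the constructor now takes unk_token (default "<unk>") instead of a special_tokens list, and the ftfy/spacy versus BasicTokenizer fallback no longer needs to be kept in sync with special tokens. A small sketch; the vocabulary paths are stand-ins and the printed output assumes the tiny test vocabulary used in the test files above:

    from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer

    # Stand-in paths; with ftfy and spacy installed they are used for pre-BPE
    # tokenization, otherwise BERT's BasicTokenizer (lower-casing) is the fallback.
    tokenizer = OpenAIGPTTokenizer('vocab.json', 'merges.txt', unk_token='<unk>')

    print(tokenizer.tokenize("lower"))   # e.g. ['low', 'er</w>'] with the small test vocabulary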
pytorch_transformers/tokenization_transfo_xl.py (view file @ b1978698)

@@ -41,7 +41,7 @@ else:
 logger = logging.getLogger(__name__)

-VOCAB_FILES_NAMES = {'pretrained_vocab_file': 'vocab.bin'}
+VOCAB_FILES_NAMES = {'pretrained_vocab_file': 'vocab.bin', 'vocab_file': 'vocab.txt'}

 PRETRAINED_VOCAB_FILES_MAP = {
     'pretrained_vocab_file':
@@ -67,9 +67,17 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

-    def __init__(self, special=[], min_freq=0, max_size=None, lower_case=False,
-                 delimiter=None, vocab_file=None, pretrained_vocab_file=None,
-                 never_split=("<unk>", "<eos>", "<formula>")):
+    def __init__(self, special=None, min_freq=0, max_size=None, lower_case=False,
+                 delimiter=None, vocab_file=None, pretrained_vocab_file=None,
+                 never_split=None, unk_token="<unk>", eos_token="<eos>",
+                 additional_special_tokens=["<formula>"], **kwargs):
+        super(TransfoXLTokenizer, self).__init__(unk_token=unk_token, eos_token=eos_token,
+                                                 additional_special_tokens=additional_special_tokens,
+                                                 **kwargs)
+        if never_split is None:
+            never_split = self.all_special_tokens
+        if special is None:
+            special = []
         self.counter = Counter()
         self.special = special
         self.min_freq = min_freq
@@ -200,11 +208,13 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
             self.idx2sym.append(sym)
             self.sym2idx[sym] = len(self.idx2sym) - 1

-    def get_sym(self, idx):
+    def _convert_id_to_token(self, idx):
+        """Converts an id in a token (BPE) using the vocab."""
         assert 0 <= idx < len(self), 'Index {} out of vocabulary range'.format(idx)
         return self.idx2sym[idx]

-    def get_idx(self, sym):
+    def _convert_token_to_id(self, sym):
+        """ Converts a token (str/unicode) in an id using the vocab. """
         if sym in self.sym2idx:
             return self.sym2idx[sym]
         else:
@@ -220,36 +230,19 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
             else:
                 raise ValueError('Token not in vocabulary and no <unk> token in vocabulary for replacement')

-    def convert_ids_to_tokens(self, indices):
-        """Converts a sequence of indices in symbols using the vocab."""
-        return [self.get_sym(idx) for idx in indices]
-
-    def convert_tokens_to_ids(self, symbols):
-        """Converts a sequence of symbols into ids using the vocab."""
-        return [self.get_idx(sym) for sym in symbols]
+    def _convert_ids_to_string(self, tokens_ids):
+        """Converts a sequence of ids in a string."""
+        out_string = ' '.join(tokens_ids).strip()
+        return out_string

     def convert_to_tensor(self, symbols):
         return torch.LongTensor(self.convert_tokens_to_ids(symbols))

-    def encode(self, text):
-        return self.convert_tokens_to_ids(self.tokenize(text))
-
-    def decode(self, indices, exclude=None, clean_up_tokenization_spaces=True):
-        """Converts a sequence of indices in a string."""
-        if exclude is None:
-            out_string = ' '.join([self.get_sym(idx) for idx in indices])
-        else:
-            out_string = ' '.join([self.get_sym(idx) for idx in indices if idx not in exclude])
-        if clean_up_tokenization_spaces:
-            out_string = clean_up_tokenization(out_string)
-        return out_string
-
-    def __len__(self):
+    @property
+    def vocab_size(self):
         return len(self.idx2sym)

-    def tokenize(self, line, add_eos=False, add_double_eos=False):
+    def _tokenize(self, line, add_eos=False, add_double_eos=False):
         line = line.strip()
         # convert to lower case
         if self.lower_case:
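For Transformer-XL, get_sym()/get_idx() become the _convert_id_to_token()/_convert_token_to_id() hooks, vocab_size replaces the custom __len__, and encode()/decode() are inherited from the base class. A minimal sketch of direct construction with the new named special tokens; 'vocab.txt' is a stand-in path to a plain word-level vocabulary file (one token per line), and the tokenized output mirrors the updated test:

    from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer

    tokenizer = TransfoXLTokenizer(vocab_file='vocab.txt', lower_case=True,
                                   unk_token='<unk>', eos_token='<eos>',
                                   additional_special_tokens=['<formula>'])

    # Whitespace/word-level tokenization with lower-casing, as asserted in the test:
    tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
    print(tokens)   # ['<unk>', 'unwanted', ',', 'running']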
pytorch_transformers/tokenization_utils.py
View file @
b1978698
...
...
@@ -16,37 +16,145 @@
from
__future__
import
(
absolute_import
,
division
,
print_function
,
unicode_literals
)
import
sys
import
json
import
logging
import
os
import
regex
as
re
import
json
import
six
from
io
import
open
try
:
from
functools
import
lru_cache
except
ImportError
:
# Just a dummy decorator to get the checks to run on python2
# because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now.
def
lru_cache
():
return
lambda
func
:
func
from
.file_utils
import
cached_path
logger
=
logging
.
getLogger
(
__name__
)
SPECIAL_TOKENS_MAP_FILE
=
'special_tokens_map.json'
ADDED_TOKENS_FILE
=
'added_tokens.json'
class
PreTrainedTokenizer
(
object
):
""" An abstract class to handle dowloading and loading pretrained tokenizers.
""" An abstract class to handle dowloading and loading pretrained tokenizers and adding tokens to the vocabulary.
Derived class can set up a few special tokens to be used in common scripts and internals:
bos_token, eos_token, EOP_TOKEN, EOD_TOKEN, unk_token, sep_token, pad_token, cls_token, mask_token
additional_special_tokens = []
We defined an added_tokens_encoder to add new tokens to the vocabulary without having to handle the
specific vocabulary augmentation methods of the various underlying dictionnary structures (BPE, sentencepiece...).
"""
vocab_files_names
=
{}
pretrained_vocab_files_map
=
{}
max_model_input_sizes
=
{}
SPECIAL_TOKENS_ATTRIBUTES
=
[
"bos_token"
,
"eos_token"
,
"unk_token"
,
"sep_token"
,
"pad_token"
,
"cls_token"
,
"mask_token"
,
"additional_special_tokens"
]
@
property
def
bos_token
(
self
):
if
self
.
_bos_token
is
None
:
logger
.
error
(
"Using bos_token, but it is not set yet."
)
return
self
.
_bos_token
@
property
def
eos_token
(
self
):
if
self
.
_eos_token
is
None
:
logger
.
error
(
"Using eos_token, but it is not set yet."
)
return
self
.
_eos_token
@
property
def
unk_token
(
self
):
if
self
.
_unk_token
is
None
:
logger
.
error
(
"Using unk_token, but it is not set yet."
)
return
self
.
_unk_token
@
property
def
sep_token
(
self
):
if
self
.
_sep_token
is
None
:
logger
.
error
(
"Using sep_token, but it is not set yet."
)
return
self
.
_sep_token
@
property
def
pad_token
(
self
):
if
self
.
_pad_token
is
None
:
logger
.
error
(
"Using pad_token, but it is not set yet."
)
return
self
.
_pad_token
@
property
def
cls_token
(
self
):
if
self
.
_cls_token
is
None
:
logger
.
error
(
"Using cls_token, but it is not set yet."
)
return
self
.
_cls_token
@
property
def
mask_token
(
self
):
if
self
.
_mask_token
is
None
:
logger
.
error
(
"Using mask_token, but it is not set yet."
)
return
self
.
_mask_token
@
property
def
additional_special_tokens
(
self
):
if
self
.
_additional_special_tokens
is
None
:
logger
.
error
(
"Using additional_special_tokens, but it is not set yet."
)
return
self
.
_additional_special_tokens
@
bos_token
.
setter
def
bos_token
(
self
,
value
):
self
.
_bos_token
=
value
@
eos_token
.
setter
def
eos_token
(
self
,
value
):
self
.
_eos_token
=
value
@
unk_token
.
setter
def
unk_token
(
self
,
value
):
self
.
_unk_token
=
value
@
sep_token
.
setter
def
sep_token
(
self
,
value
):
self
.
_sep_token
=
value
@
pad_token
.
setter
def
pad_token
(
self
,
value
):
self
.
_pad_token
=
value
@
cls_token
.
setter
def
cls_token
(
self
,
value
):
self
.
_cls_token
=
value
@
mask_token
.
setter
def
mask_token
(
self
,
value
):
self
.
_mask_token
=
value
@
additional_special_tokens
.
setter
def
additional_special_tokens
(
self
,
value
):
self
.
_additional_special_tokens
=
value
def
__init__
(
self
,
max_len
=
None
,
**
kwargs
):
self
.
_bos_token
=
None
self
.
_eos_token
=
None
self
.
_unk_token
=
None
self
.
_sep_token
=
None
self
.
_pad_token
=
None
self
.
_cls_token
=
None
self
.
_mask_token
=
None
self
.
_additional_special_tokens
=
[]
self
.
max_len
=
max_len
if
max_len
is
not
None
else
int
(
1e12
)
self
.
added_tokens_encoder
=
{}
self
.
added_tokens_decoder
=
{}
for
key
,
value
in
kwargs
.
items
():
if
key
not
in
self
.
SPECIAL_TOKENS_ATTRIBUTES
:
raise
ValueError
(
"PreTrainedTokenizer.__init__() argument {} should be in {}"
.
format
(
key
,
', '
.
join
(
self
.
SPECIAL_TOKENS_ATTRIBUTES
)))
else
:
setattr
(
self
,
key
,
value
)
@
classmethod
def
from_pretrained
(
cls
,
*
inputs
,
**
kwargs
):
return
cls
.
_from_pretrained
(
*
inputs
,
**
kwargs
)
@
classmethod
def
_from_pretrained
(
cls
,
pretrained_model_name_or_path
,
cache_dir
=
None
,
*
inputs
,
**
kwargs
):
"""
...
...
@@ -59,16 +167,20 @@ class PreTrainedTokenizer(object):
for
file_id
,
map_list
in
cls
.
pretrained_vocab_files_map
.
items
():
vocab_files
[
file_id
]
=
map_list
[
pretrained_model_name_or_path
]
else
:
for
file_id
,
file_name
in
cls
.
vocab_files_names
.
items
():
all_vocab_files_names
=
{
'added_tokens_file'
:
ADDED_TOKENS_FILE
,
'special_tokens_map_file'
:
SPECIAL_TOKENS_MAP_FILE
}
all_vocab_files_names
.
update
(
cls
.
vocab_files_names
)
for
file_id
,
file_name
in
all_vocab_files_names
.
items
():
if
os
.
path
.
isdir
(
pretrained_model_name_or_path
):
full_file_name
=
os
.
path
.
join
(
pretrained_model_name_or_path
,
file_name
)
else
:
full_file_name
=
pretrained_model_name_or_path
if
not
os
.
path
.
exists
(
full_file_name
):
logger
.
info
(
"Didn't find file {}. We
d
on't load it."
.
format
(
full_file_name
))
logger
.
info
(
"Didn't find file {}. We
w
on't load it."
.
format
(
full_file_name
))
full_file_name
=
None
vocab_files
[
file_id
]
=
full_file_name
# redirect to the cache, if necessary
# Get files from url, cache, or disk depending on the case
try
:
resolved_vocab_files
=
{}
for
file_id
,
file_path
in
vocab_files
.
items
():
...
...
@@ -95,6 +207,7 @@ class PreTrainedTokenizer(object):
logger
.
info
(
"loading file {} from cache at {}"
.
format
(
file_path
,
resolved_vocab_files
[
file_id
]))
# Set max length if needed
if
pretrained_model_name_or_path
in
cls
.
max_model_input_sizes
:
# if we're using a pretrained model, ensure the tokenizer
# wont index sequences longer than the number of positional embeddings
...
...
@@ -102,33 +215,257 @@ class PreTrainedTokenizer(object):
kwargs
[
'max_len'
]
=
min
(
kwargs
.
get
(
'max_len'
,
int
(
1e12
)),
max_len
)
# Merge resolved_vocab_files arguments in kwargs.
added_tokens_file
=
resolved_vocab_files
.
pop
(
'added_tokens_file'
,
None
)
special_tokens_map_file
=
resolved_vocab_files
.
pop
(
'special_tokens_map_file'
,
None
)
for
args_name
,
file_path
in
resolved_vocab_files
.
items
():
if
args_name
not
in
kwargs
:
kwargs
[
args_name
]
=
file_path
if
special_tokens_map_file
is
not
None
:
special_tokens_map
=
json
.
load
(
open
(
special_tokens_map_file
,
encoding
=
"utf-8"
))
for
key
,
value
in
special_tokens_map
.
items
():
if
key
not
in
kwargs
:
kwargs
[
key
]
=
value
# Instantiate tokenizer.
tokenizer
=
cls
(
*
inputs
,
**
kwargs
)
# Add supplementary tokens.
if
added_tokens_file
is
not
None
:
added_tokens
=
json
.
load
(
open
(
added_tokens_file
,
encoding
=
"utf-8"
))
added_tok_encoder
=
dict
((
tok
,
len
(
tokenizer
)
+
i
)
for
i
,
tok
in
enumerate
(
added_tokens
))
added_tok_decoder
=
{
v
:
k
for
k
,
v
in
added_tok_encoder
.
items
()}
tokenizer
.
added_tokens_encoder
.
update
(
added_tok_encoder
)
tokenizer
.
added_tokens_decoder
.
update
(
added_tok_decoder
)
return
tokenizer
def
tokenize
(
self
,
text
):
raise
NotImplementedError
def
convert_tokens_to_ids
(
self
,
tokens
):
def
save_pretrained
(
self
,
save_directory
):
""" Save the tokenizer vocabulary files (with added tokens) and the
special-tokens-to-class-attributes-mapping to a directory, so that it
can be re-loaded using the `from_pretrained(save_directory)` class method.
"""
if
not
os
.
path
.
isdir
(
save_directory
):
logger
.
error
(
"Saving directory ({}) should be a directory"
.
format
(
save_directory
))
return
special_tokens_map_file
=
os
.
path
.
join
(
save_directory
,
SPECIAL_TOKENS_MAP_FILE
)
added_tokens_file
=
os
.
path
.
join
(
save_directory
,
ADDED_TOKENS_FILE
)
with
open
(
special_tokens_map_file
,
'w'
,
encoding
=
'utf-8'
)
as
f
:
f
.
write
(
json
.
dumps
(
self
.
special_tokens_map
,
ensure_ascii
=
False
))
with
open
(
added_tokens_file
,
'w'
,
encoding
=
'utf-8'
)
as
f
:
f
.
write
(
json
.
dumps
(
self
.
added_tokens_decoder
,
ensure_ascii
=
False
))
vocab_files
=
self
.
save_vocabulary
(
save_directory
)
return
vocab_files
+
(
special_tokens_map_file
,
added_tokens_file
)
def
save_vocabulary
(
self
,
save_directory
):
""" Save the tokenizer vocabulary to a directory. This method doesn't save added tokens
and special token mappings.
Please use `save_pretrained()` to save the full Tokenizer state so that it can be
reloaded using the `from_pretrained(save_directory)` class method.
"""
raise
NotImplementedError
def
convert_ids_to_tokens
(
self
,
ids
):
def
vocab_size
(
self
):
raise
NotImplementedError
def
encode
(
self
,
text
):
def
__len__
(
self
):
return
self
.
vocab_size
+
len
(
self
.
added_tokens_encoder
)
def
add_tokens
(
self
,
new_tokens
):
""" Add a list of new tokens to the tokenizer class. If the new tokens are not in the
vocabulary, they are added to the added_tokens_encoder with indices starting from
the last index of the current vocabulary.
Returns:
Number of tokens added to the vocabulary which can be used to correspondingly
increase the size of the associated model embedding matrices.
"""
if
not
new_tokens
:
return
0
to_add_tokens
=
[]
for
token
in
new_tokens
:
if
self
.
convert_tokens_to_ids
(
token
)
==
self
.
convert_tokens_to_ids
(
self
.
unk_token
):
to_add_tokens
.
append
(
token
)
logger
.
info
(
"Adding %s to the vocabulary"
,
token
)
added_tok_encoder
=
dict
((
tok
,
len
(
self
)
+
i
)
for
i
,
tok
in
enumerate
(
to_add_tokens
))
added_tok_decoder
=
{
v
:
k
for
k
,
v
in
added_tok_encoder
.
items
()}
self
.
added_tokens_encoder
.
update
(
added_tok_encoder
)
self
.
added_tokens_decoder
.
update
(
added_tok_decoder
)
return
len
(
to_add_tokens
)
def
add_special_tokens
(
self
,
special_tokens_dict
):
""" Add a dictionnary of special tokens (eos, pad, cls...) to the encoder and link them
to class attributes. If the special tokens are not in the vocabulary, they are added
to it and indexed starting from the last index of the current vocabulary.
Returns:
Number of tokens added to the vocabulary which can be used to correspondingly
increase the size of the associated model embedding matrices.
"""
if
not
special_tokens_dict
:
return
0
added_special_tokens
=
self
.
add_tokens
(
special_tokens_dict
.
values
())
for
key
,
value
in
special_tokens_dict
.
items
():
logger
.
info
(
"Assigning %s to the %s key of the tokenizer"
,
value
,
key
)
setattr
(
self
,
key
,
value
)
return
added_special_tokens
def
tokenize
(
self
,
text
,
**
kwargs
):
""" Converts a string in a sequence of tokens (string), using the tokenizer.
Split in words for word-based vocabulary or sub-words for sub-word-based
vocabularies (BPE/SentencePieces/WordPieces).
Take care of added tokens.
"""
def
split_on_tokens
(
tok_list
,
text
):
if
not
text
:
return
[]
if
not
tok_list
:
return
self
.
_tokenize
(
text
,
**
kwargs
)
tok
=
tok_list
[
0
]
split_text
=
text
.
split
(
tok
)
return
sum
((
split_on_tokens
(
tok_list
[
1
:],
sub_text
.
strip
())
+
[
tok
]
\
for
sub_text
in
split_text
),
[])[:
-
1
]
added_tokens
=
list
(
self
.
added_tokens_encoder
.
keys
())
tokenized_text
=
split_on_tokens
(
added_tokens
,
text
)
return
tokenized_text
def
_tokenize
(
self
,
text
,
**
kwargs
):
""" Converts a string in a sequence of tokens (string), using the tokenizer.
Split in words for word-based vocabulary or sub-words for sub-word-based
vocabularies (BPE/SentencePieces/WordPieces).
Don't take care of added tokens.
"""
raise
NotImplementedError
def
decode
(
self
,
token_ids
,
*
input
,
**
kwargs
):
def
convert_tokens_to_ids
(
self
,
tokens
):
""" Converts a single token or a sequence of tokens (str/unicode) in a integer id
(resp.) a sequence of ids, using the vocabulary.
"""
if
isinstance
(
tokens
,
str
)
or
(
six
.
PY2
and
isinstance
(
tokens
,
unicode
)):
return
self
.
convert_token_to_id_with_added_voc
(
tokens
)
ids
=
[]
for
token
in
tokens
:
ids
.
append
(
self
.
convert_token_to_id_with_added_voc
(
token
))
if
len
(
ids
)
>
self
.
max_len
:
logger
.
warning
(
"Token indices sequence length is longer than the specified maximum sequence length "
"for this model ({} > {}). Running this sequence through the model will result in "
"indexing errors"
.
format
(
len
(
ids
),
self
.
max_len
))
return
ids
def
convert_token_to_id_with_added_voc
(
self
,
token
):
if
token
in
self
.
added_tokens_encoder
:
return
self
.
added_tokens_encoder
[
token
]
return
self
.
_convert_token_to_id
(
token
)
    def _convert_token_to_id(self, token):
        raise NotImplementedError

    def save_vocabulary(self, vocab_path):
    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """ Converts a single index or a sequence of indices (integers) in a token
            (resp. a sequence of tokens (str/unicode)), using the vocabulary and added tokens.

            Args:
                skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False
        """
        if isinstance(ids, int):
            return self.convert_id_to_token(ids)
        tokens = []
        for index in ids:
            if index in self.all_special_ids and skip_special_tokens:
                continue
            if index in self.added_tokens_decoder:
                tokens.append(self.added_tokens_decoder[index])
            else:
                tokens.append(self._convert_id_to_token(index))
        return tokens

    def _convert_id_to_token(self, index):
        raise NotImplementedError
    def encode(self, text):
        """ Converts a string in a sequence of ids (integers), using the tokenizer and vocabulary.
            Same as self.convert_tokens_to_ids(self.tokenize(text)).
        """
        return self.convert_tokens_to_ids(self.tokenize(text))

    def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
        """ Converts a sequence of ids (integers) in a string, using the tokenizer and vocabulary
            with options to remove special tokens and clean up tokenization spaces.
        """
        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
        text = self._convert_ids_to_string(filtered_tokens)
        if clean_up_tokenization_spaces:
            text = clean_up_tokenization(text)
        return text

    def _convert_ids_to_string(self, tokens_ids):
        """ Converts a sequence of ids (integers) in a string, using the tokenizer and vocabulary.
            Roughly the same as ' '.join(self.convert_ids_to_tokens(token_ids)).
        """
        return ' '.join(self.convert_ids_to_tokens(tokens_ids))
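A minimal encode/decode round trip under the unified API (a sketch only; any concrete tokenizer subclass from this commit should work, and the sentence is arbitrary):

    ids = tokenizer.encode("unified tokenizer api and serialization")
    text = tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    # encode is convert_tokens_to_ids(tokenize(text)); decode reverses it and tidies the spacing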
    @property
    def special_tokens_map(self):
        """ A dictionary mapping special token class attributes (cls_token, unk_token...) to their
            values ('<unk>', '<cls>'...)
        """
        set_attr = {}
        for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
            attr_value = getattr(self, "_" + attr)
            if attr_value:
                set_attr[attr] = attr_value
        return set_attr

    @property
    def all_special_tokens(self):
        """ List all the special tokens ('<unk>', '<cls>'...) mapped to class attributes
            (cls_token, unk_token...).
        """
        all_toks = []
        set_attr = self.special_tokens_map
        for attr_value in set_attr.values():
            all_toks = all_toks + (attr_value if isinstance(attr_value, (list, tuple)) else [attr_value])
        all_toks = list(set(all_toks))
        return all_toks

    @property
    def all_special_ids(self):
        """ List the vocabulary indices of the special tokens ('<unk>', '<cls>'...) mapped to
            class attributes (cls_token, unk_token...).
        """
        all_toks = self.all_special_tokens
        all_ids = list(self.convert_tokens_to_ids(t) for t in all_toks)
        return all_ids
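For illustration only (the exact values depend on the subclass), the three properties expose the configured special tokens in different shapes:

    tokenizer.special_tokens_map   # e.g. {'unk_token': '<unk>', 'sep_token': '<sep>', ...}
    tokenizer.all_special_tokens   # deduplicated list of the values above
    tokenizer.all_special_ids      # the same tokens mapped through convert_tokens_to_ids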
def clean_up_tokenization(out_string):
    out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
              ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
...
...
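Taken together, the base class now drives tokenize/encode/decode and all added/special-token bookkeeping, and subclasses only supply the private hooks. A schematic, hypothetical subclass sketched from the hooks above (vocab and ids_to_tokens are made-up attribute names, not part of this commit):

    class MyTokenizer(PreTrainedTokenizer):
        def _tokenize(self, text):
            return text.split()                      # subclass-specific splitting
        def _convert_token_to_id(self, token):
            return self.vocab.get(token, self.vocab.get(self.unk_token))
        def _convert_id_to_token(self, index):
            return self.ids_to_tokens.get(index, self.unk_token)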
pytorch_transformers/tokenization_xlm.py
View file @
b1978698
...
...
@@ -34,7 +34,6 @@ logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {
    'vocab_file': 'vocab.json',
    'merges_file': 'merges.txt',
    'special_tokens_file': 'special_tokens.txt'
}

PRETRAINED_VOCAB_FILES_MAP = {
...
...
@@ -46,24 +45,12 @@ PRETRAINED_VOCAB_FILES_MAP = {
    {
        'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt",
    },
    'special_tokens_file':
    {
        'xlm-mlm-en-2048': None,
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'xlm-mlm-en-2048': 512,
}

INDEX = {"bos_index": 0, "eos_index": 1, "pad_index": 2,
         "unk_index": 3, "mask_index": 5}

def get_pairs(word):
    """
    Return set of symbol pairs in a word.
...
...
@@ -103,7 +90,16 @@ class XLMTokenizer(PreTrainedTokenizer):
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(self, vocab_file, merges_file, special_tokens_file=None, special_tokens=None, max_len=None):
    def __init__(self, vocab_file, merges_file, unk_token="<unk>", bos_token="<s>",
                 sep_token="</s>", pad_token="<pad>", cls_token="</s>",
                 mask_token="<special1>", additional_special_tokens=["<special0>",
                 "<special1>", "<special2>", "<special3>", "<special4>", "<special5>",
                 "<special6>", "<special7>", "<special8>", "<special9>"], **kwargs):
        super(XLMTokenizer, self).__init__(unk_token=unk_token, bos_token=bos_token,
                                           sep_token=sep_token, pad_token=pad_token,
                                           cls_token=cls_token, mask_token=mask_token,
                                           additional_special_tokens=additional_special_tokens,
                                           **kwargs)
        try:
            import ftfy
            import spacy
...
...
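A sketch of constructing the reworked XLMTokenizer directly from local files (paths are hypothetical); the special tokens now flow through the shared PreTrainedTokenizer constructor instead of a separate special_tokens.txt file:

    tokenizer = XLMTokenizer(vocab_file='vocab.json', merges_file='merges.txt',
                             unk_token='<unk>', mask_token='<special1>')
    print(tokenizer.mask_token, tokenizer.all_special_tokens)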
@@ -111,11 +107,9 @@ class XLMTokenizer(PreTrainedTokenizer):
            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
            self.nlp = BasicTokenizer(do_lower_case=True, never_split=special_tokens if special_tokens is not None else [])
            self.nlp = BasicTokenizer(do_lower_case=True)
            self.fix_text = None

        self.max_len = max_len if max_len is not None else int(1e12)

        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
        self.decoder = {v: k for k, v in self.encoder.items()}
        merges = open(merges_file, encoding='utf-8').read().split('\n')[:-1]
...
...
@@ -123,35 +117,9 @@ class XLMTokenizer(PreTrainedTokenizer):
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}

        all_special_tokens = []
        if special_tokens_file is not None:
            special_tokens_to_add = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
            all_special_tokens.extend(special_tokens_to_add)
        if special_tokens is not None and special_tokens:
            all_special_tokens.extend(special_tokens)

        self.special_tokens = {}
        self.special_tokens_decoder = {}
        self.set_special_tokens(all_special_tokens)

    def __len__(self):
        return len(self.encoder) + len(self.special_tokens)

    def set_special_tokens(self, special_tokens):
        """ Add a list of additional tokens to the encoder.
            The additional tokens are indexed starting from the last index of the
            current vocabulary in the order of the `special_tokens` list.
        """
        if not special_tokens:
            self.special_tokens = {}
            self.special_tokens_decoder = {}
            return
        self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
        self.special_tokens_decoder = {v: k for k, v in self.special_tokens.items()}
        if self.fix_text is None:
            # Using BERT's BasicTokenizer: we can update the tokenizer
            self.nlp.never_split = special_tokens
        logger.info("Special tokens {}".format(self.special_tokens))

    @property
    def vocab_size(self):
        return len(self.encoder)

    def bpe(self, token):
        word = tuple(token[:-1]) + (token[-1] + '</w>',)
...
...
@@ -196,7 +164,7 @@ class XLMTokenizer(PreTrainedTokenizer):
        self.cache[token] = word
        return word

    def tokenize(self, text):
    def _tokenize(self, text):
        """ Tokenize a string. """
        split_tokens = []
        if self.fix_text is None:
...
...
@@ -211,58 +179,26 @@ class XLMTokenizer(PreTrainedTokenizer):
                split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        """ Converts a sequence of tokens into ids using the vocab. """
        ids = []
        if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
            if tokens in self.special_tokens:
                return self.special_tokens[tokens]
            else:
                return self.encoder.get(tokens, 0)
        for token in tokens:
            if token in self.special_tokens:
                ids.append(self.special_tokens[token])
            else:
                ids.append(self.encoder.get(token, 0))
        if len(ids) > self.max_len:
            logger.warning("Token indices sequence length is longer than the specified maximum "
                           " sequence length for this OpenAI GPT model ({} > {}). Running this"
                           " sequence through the model will result in indexing errors".format(len(ids), self.max_len))
        return ids

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """Converts a sequence of ids in BPE tokens using the vocab."""
        tokens = []
        for i in ids:
            if i in self.special_tokens_decoder:
                if not skip_special_tokens:
                    tokens.append(self.special_tokens_decoder[i])
            else:
                tokens.append(self.decoder[i])
        return tokens

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) in an id using the vocab. """
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def encode(self, text):
        return self.convert_tokens_to_ids(self.tokenize(text))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (string/unicode) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
    def _convert_ids_to_string(self, tokens_ids):
        """Converts a sequence of ids in a string."""
        tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)
        out_string = ''.join(tokens).replace('</w>', ' ').strip()
        if clean_up_tokenization_spaces:
            out_string = out_string.replace('<unk>', '')
            out_string = clean_up_tokenization(out_string)
        out_string = ''.join(tokens_ids).replace('</w>', ' ').strip()
        return out_string

    def save_vocabulary(self, vocab_path):
    def save_vocabulary(self, save_directory):
        """Save the tokenizer vocabulary and merge files to a directory."""
        if not os.path.isdir(vocab_path):
            logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
        merge_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['merges_file'])
        special_tokens_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['special_tokens_file'])
        vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
        merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])

        with open(vocab_file, 'w', encoding='utf-8') as f:
            f.write(json.dumps(self.encoder, ensure_ascii=False))
...
...
@@ -277,14 +213,4 @@ class XLMTokenizer(PreTrainedTokenizer):
            writer.write(' '.join(bpe_tokens) + u'\n')
            index += 1

        index = len(self.encoder)
        with open(special_tokens_file, 'w', encoding='utf-8') as writer:
            for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive."
                                   " Please check that the tokenizer is not corrupted!".format(special_tokens_file))
                    index = token_index
                writer.write(token + u'\n')
                index += 1

        return vocab_file, merge_file, special_tokens_file
        return vocab_file, merge_file
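A hedged round-trip sketch for the simplified XLM serialization (the directory name is hypothetical and must already exist): save_vocabulary now writes only the vocab and merges files, and the returned paths can be fed straight back to the constructor:

    vocab_path, merges_path = tokenizer.save_vocabulary('./xlm-tok')
    reloaded = XLMTokenizer(vocab_file=vocab_path, merges_file=merges_path)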
pytorch_transformers/tokenization_xlnet.py
View file @
b1978698
...
...
@@ -16,17 +16,13 @@
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import json
import logging
import os
import sys
from shutil import copyfile
from io import open

import unicodedata
import six

from .file_utils import cached_path
from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization

logger = logging.getLogger(__name__)
...
...
@@ -44,8 +40,6 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'xlnet-large-cased': 512,
}

VOCAB_NAME = 'spiece.model'

SPIECE_UNDERLINE = u'▁'
# Segments (not really needed)
...
...
@@ -60,31 +54,26 @@ class XLNetTokenizer(PreTrainedTokenizer):
SentencePiece based tokenizer. Peculiarities:
- requires SentencePiece: https://github.com/google/sentencepiece
"""
    # Tokens
    special_symbols = {
        "<unk>": 0,
        "<s>": 1,
        "</s>": 2,
        "<cls>": 3,
        "<sep>": 4,
        "<pad>": 5,
        "<mask>": 6,
        "<eod>": 7,
        "<eop>": 8,
    }

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(self, vocab_file, max_len=None,
                 do_lower_case=False, remove_space=True, keep_accents=False):
                 do_lower_case=False, remove_space=True, keep_accents=False,
                 bos_token="<s>", eos_token="</s>", unk_token="<unk>", sep_token="<sep>",
                 pad_token="<pad>", cls_token="<cls>", mask_token="<mask>",
                 additional_special_tokens=["<eop>", "<eod>"], **kwargs):
        super(XLNetTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token,
                                             unk_token=unk_token, sep_token=sep_token,
                                             pad_token=pad_token, cls_token=cls_token,
                                             mask_token=mask_token,
                                             additional_special_tokens=additional_special_tokens,
                                             **kwargs)
        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
                           "pip install sentencepiece")

        self.max_len = max_len if max_len is not None else int(1e12)
        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
...
...
@@ -94,46 +83,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
        self.sp_model.Load(vocab_file)

    @property
    def UNK_TOKEN(self):
        return "<unk>"

    @property
    def SEP_TOKEN(self):
        return "<sep>"

    @property
    def PAD_TOKEN(self):
        return "<pad>"

    @property
    def CLS_TOKEN(self):
        return "<cls>"

    @property
    def MASK_TOKEN(self):
        return "<mask>"

    @property
    def UNK_ID(self):
        return self.special_symbols["<unk>"]

    @property
    def SEP_ID(self):
        return self.special_symbols["<sep>"]

    @property
    def PAD_ID(self):
        return self.special_symbols["<pad>"]

    @property
    def CLS_ID(self):
        return self.special_symbols["<cls>"]

    @property
    def MASK_ID(self):
        return self.special_symbols["<mask>"]

    def __len__(self):
    def vocab_size(self):
        return len(self.sp_model)

    def __getstate__(self):
...
...
@@ -169,7 +119,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
        return outputs

    def tokenize(self, text, return_unicode=True, sample=False):
    def _tokenize(self, text, return_unicode=True, sample=False):
        """ Tokenize a string.
            return_unicode is used only for py2
        """
...
...
@@ -208,56 +158,30 @@ class XLNetTokenizer(PreTrainedTokenizer):
        return new_pieces

    def convert_tokens_to_ids(self, tokens, sample=False):
        """ Converts a sequence of tokens into ids using the vocab. """
        ids = []
        if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
            return self.sp_model.PieceToId(tokens)
        for token in tokens:
            ids.append(self.sp_model.PieceToId(token))
        if len(ids) > self.max_len:
            logger.warning("Token indices sequence length is longer than the specified maximum "
                           " sequence length for this XLNet model ({} > {}). Running this"
                           " sequence through the model will result in indexing errors".format(len(ids), self.max_len))
        return ids

    def convert_ids_to_tokens(self, ids, return_unicode=True):
        """Converts a sequence of ids in tokens."""
        tokens = []
        for i in ids:
            tokens.append(self.sp_model.IdToPiece(i))
        if six.PY2 and return_unicode:
            ret_pieces = []
            for piece in tokens:
                if isinstance(piece, str):
                    piece = piece.decode('utf-8')
                ret_pieces.append(piece)
            tokens = ret_pieces
        return tokens

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) in an id using the vocab. """
        return self.sp_model.PieceToId(token)

    def encode(self, text, sample=False):
        return self.convert_tokens_to_ids(self.tokenize(text, sample=sample))

    def _convert_id_to_token(self, index, return_unicode=True):
        """Converts an index (integer) in a token (string/unicode) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        if six.PY2 and return_unicode and isinstance(token, str):
            token = token.decode('utf-8')
        return token

    def decode(self, ids, clean_up_tokenization_spaces=True):
    def _convert_ids_to_string(self, tokens_ids):
        """Converts a sequence of ids in a string."""
        tokens = self.convert_ids_to_tokens(ids)
        out_string = ''.join(tokens)
        if clean_up_tokenization_spaces:
            out_string = out_string.strip().replace('<unk>', '')
            out_string = clean_up_tokenization(out_string)
        out_string = ''.join(tokens_ids)
        return out_string

    def save_vocabulary(self, vocab_path):
    def save_vocabulary(self, save_directory):
        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
            to a directory.
        """
        if not os.path.isdir(vocab_path):
            logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        out_vocab_file = os.path.join(vocab_path, VOCAB_NAME)
        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])

        copyfile(self.vocab_file, out_vocab_file)
...
...
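For XLNet the vocabulary is a single SentencePiece model, so saving is essentially a file copy. A hedged sketch (the directory is hypothetical and must already exist); the target filename is taken from the class-level vocab_files_names mapping rather than hard-coded:

    import os
    tokenizer.save_vocabulary('./xlnet-tok')   # copies the SentencePiece model into the directory
    out_file = os.path.join('./xlnet-tok', XLNetTokenizer.vocab_files_names['vocab_file'])
    reloaded = XLNetTokenizer(vocab_file=out_file)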