Commit 386a93f0
Authored Dec 12, 2019 by Thomas Wolf; committed by GitHub on Dec 12, 2019
Merge branch 'master' into from-pretrained-from-url
Parents: 4f15e5a2, 2d103546
Showing 11 changed files with 572 additions and 37 deletions (+572, -37)
.circleci/config.yml (+23, -0)
docs/source/pretrained_models.rst (+26, -8)
transformers/__init__.py (+1, -0)
transformers/configuration_bert.py (+4, -0)
transformers/modeling_bert.py (+6, -2)
transformers/modeling_tf_bert.py (+10, -6)
transformers/tests/tokenization_bert_japanese_test.py (+191, -0)
transformers/tests/utils.py (+31, -5)
transformers/tokenization_auto.py (+4, -0)
transformers/tokenization_bert_japanese.py (+253, -0)
transformers/tokenization_utils.py (+23, -16)
.circleci/config.yml
@@ -70,6 +70,27 @@ jobs:
        - run: sudo pip install pytest codecov pytest-cov
        - run: python -m pytest -sv ./transformers/tests/ --cov
        - run: codecov
    build_py3_custom_tokenizers:
        working_directory: ~/transformers
        docker:
            - image: circleci/python:3.5
        steps:
            - checkout
            - run: sudo pip install --progress-bar off .
            - run: sudo pip install pytest
            - run: sudo pip install mecab-python3
            - run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py
    build_py2_custom_tokenizers:
        working_directory: ~/transformers
        docker:
            - image: circleci/python:2.7
        steps:
            - checkout
            - run: sudo pip install --progress-bar off .
            - run: sudo pip install pytest
            - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig
            - run: sudo pip install mecab-python
            - run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py
    deploy_doc:
        working_directory: ~/transformers
        docker:

@@ -91,6 +112,8 @@ workflows:
    version: 2
    build_and_test:
        jobs:
            - build_py3_custom_tokenizers
            - build_py2_custom_tokenizers
            - build_py3_torch_and_tf
            - build_py3_torch
            - build_py3_tf
docs/source/pretrained_models.rst
@@ -61,6 +61,24 @@ Here is the full list of the currently provided pretrained models together with
| | ``bert-base-german-dbmdz-uncased`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on uncased German text by DBMDZ |
| | | (see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-japanese`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece. |
| | | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization. |
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-japanese-whole-word-masking`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece. |
| | | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization. |
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-japanese-char`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on Japanese text. Text is tokenized into characters. |
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-japanese-char-whole-word-masking`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters. |
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| GPT | ``openai-gpt`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | OpenAI GPT English model |
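For orientation, here is a minimal usage sketch of the checkpoints documented above (not part of the commit itself; it assumes PyTorch is available and that MeCab plus the mecab-python3 bindings are installed):

from transformers import BertJapaneseTokenizer, BertModel

# MeCab first segments the sentence into words, then WordPiece splits each word
# into subword units from the bert-base-japanese vocabulary.
tokenizer = BertJapaneseTokenizer.from_pretrained("bert-base-japanese")
model = BertModel.from_pretrained("bert-base-japanese")

input_ids = tokenizer.encode(u"こんにちは、世界。")
print(tokenizer.convert_ids_to_tokens(input_ids))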
transformers/__init__.py
@@ -37,6 +37,7 @@ if is_sklearn_available():

from .tokenization_utils import (PreTrainedTokenizer)
from .tokenization_auto import AutoTokenizer
from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
from .tokenization_bert_japanese import BertJapaneseTokenizer, MecabTokenizer, CharacterTokenizer
from .tokenization_openai import OpenAIGPTTokenizer
from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
from .tokenization_gpt2 import GPT2Tokenizer
transformers/configuration_bert.py
@@ -42,6 +42,10 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
    'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json",
    'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json",
    'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json",
    'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json",
    'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json",
    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json"
}
transformers/modeling_bert.py
@@ -48,6 +48,10 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
    'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
    'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin",
    'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin",
    'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin",
    'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin",
    'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin",
    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin"
}
transformers/modeling_tf_bert.py
@@ -48,6 +48,10 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
    'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5",
    'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5",
    'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5",
    'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-tf_model.h5",
    'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5",
    'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5",
    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5"
}
transformers/tests/tokenization_bert_japanese_test.py
new file (mode 100644)

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import unittest
from io import open

from transformers.tokenization_bert import WordpieceTokenizer
from transformers.tokenization_bert_japanese import (BertJapaneseTokenizer,
                                                     MecabTokenizer, CharacterTokenizer,
                                                     VOCAB_FILES_NAMES)

from .tokenization_tests_commons import CommonTestCases
from .utils import slow, custom_tokenizers


@custom_tokenizers
class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = BertJapaneseTokenizer

    def setUp(self):
        super(BertJapaneseTokenizationTest, self).setUp()

        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
                        u"こんにちは", u"こん", u"にちは", u"ばんは",
                        u"##こん", u"##にちは", u"##ばんは",
                        u"世界", u"##世界", u"、", u"##、", u"。", u"##。"]

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

    def get_tokenizer(self, **kwargs):
        return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, **kwargs)

    def get_input_output_texts(self):
        input_text = u"こんにちは、世界。\nこんばんは、世界。"
        output_text = u"こんにちは 、 世界 。 こんばんは 、 世界 。"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = self.tokenizer_class(self.vocab_file)

        tokens = tokenizer.tokenize(u"こんにちは、世界。\nこんばんは、世界。")
        self.assertListEqual(tokens,
                             [u"こんにちは", u"、", u"世界", u"。",
                              u"こん", u"##ばんは", u"、", u"世界", "。"])
        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
                             [3, 12, 10, 14, 4, 9, 12, 10, 14])

    def test_mecab_tokenizer(self):
        tokenizer = MecabTokenizer()

        self.assertListEqual(
            tokenizer.tokenize(u"\tアップルストアでiPhone8 が\n発売された 。 "),
            [u"アップルストア", u"で", u"iPhone", u"8", u"が",
             u"発売", u"さ", u"れ", u"た", u"。"])

    def test_mecab_tokenizer_lower(self):
        tokenizer = MecabTokenizer(do_lower_case=True)

        self.assertListEqual(
            tokenizer.tokenize(u"\tアップルストアでiPhone8 が\n発売された 。 "),
            [u"アップルストア", u"で", u"iphone", u"8", u"が",
             u"発売", u"さ", u"れ", u"た", u"。"])

    def test_mecab_tokenizer_no_normalize(self):
        tokenizer = MecabTokenizer(normalize_text=False)

        self.assertListEqual(
            tokenizer.tokenize(u"\tアップルストアでiPhone8 が\n発売された 。 "),
            [u"アップルストア", u"で", u"iPhone", u"8", u"が",
             u"発売", u"さ", u"れ", u"た", u" ", u"。"])

    def test_wordpiece_tokenizer(self):
        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
                        u"こんにちは", u"こん", u"にちは" u"ばんは",
                        u"##こん", u"##にちは", u"##ばんは"]

        vocab = {}
        for (i, token) in enumerate(vocab_tokens):
            vocab[token] = i
        tokenizer = WordpieceTokenizer(vocab=vocab, unk_token=u"[UNK]")

        self.assertListEqual(tokenizer.tokenize(u""), [])

        self.assertListEqual(tokenizer.tokenize(u"こんにちは"),
                             [u"こんにちは"])

        self.assertListEqual(tokenizer.tokenize(u"こんばんは"),
                             [u"こん", u"##ばんは"])

        self.assertListEqual(tokenizer.tokenize(u"こんばんは こんばんにちは こんにちは"),
                             [u"こん", u"##ばんは", u"[UNK]", u"こんにちは"])

    @slow
    def test_sequence_builders(self):
        tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese")

        text = tokenizer.encode(u"ありがとう。", add_special_tokens=False)
        text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False)

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        # 2 is for "[CLS]", 3 is for "[SEP]"
        assert encoded_sentence == [2] + text + [3]
        assert encoded_pair == [2] + text + [3] + text_2 + [3]


class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = BertJapaneseTokenizer

    def setUp(self):
        super(BertJapaneseCharacterTokenizationTest, self).setUp()

        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
                        u"こ", u"ん", u"に", u"ち", u"は", u"ば",
                        u"世", u"界", u"、", u"。"]

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

    def get_tokenizer(self, **kwargs):
        return BertJapaneseTokenizer.from_pretrained(self.tmpdirname,
                                                     subword_tokenizer_type="character",
                                                     **kwargs)

    def get_input_output_texts(self):
        input_text = u"こんにちは、世界。\nこんばんは、世界。"
        output_text = u"こ ん に ち は 、 世 界 。 こ ん ば ん は 、 世 界 。"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = self.tokenizer_class(self.vocab_file,
                                         subword_tokenizer_type="character")

        tokens = tokenizer.tokenize(u"こんにちは、世界。\nこんばんは、世界。")
        self.assertListEqual(tokens,
                             [u"こ", u"ん", u"に", u"ち", u"は", u"、",
                              u"世", u"界", u"。",
                              u"こ", u"ん", u"ば", u"ん", u"は", u"、",
                              u"世", u"界", u"。"])
        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
                             [3, 4, 5, 6, 7, 11, 9, 10, 12,
                              3, 4, 8, 4, 7, 11, 9, 10, 12])

    def test_character_tokenizer(self):
        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
                        u"こ", u"ん", u"に", u"ち", u"は", u"ば",
                        u"世", u"界" u"、", u"。"]

        vocab = {}
        for (i, token) in enumerate(vocab_tokens):
            vocab[token] = i
        tokenizer = CharacterTokenizer(vocab=vocab, unk_token=u"[UNK]")

        self.assertListEqual(tokenizer.tokenize(u""), [])

        self.assertListEqual(tokenizer.tokenize(u"こんにちは"),
                             [u"こ", u"ん", u"に", u"ち", u"は"])

        self.assertListEqual(tokenizer.tokenize(u"こんにちほ"),
                             [u"こ", u"ん", u"に", u"ち", u"[UNK]"])

    @slow
    def test_sequence_builders(self):
        tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese-char")

        text = tokenizer.encode(u"ありがとう。", add_special_tokens=False)
        text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False)

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        # 2 is for "[CLS]", 3 is for "[SEP]"
        assert encoded_sentence == [2] + text + [3]
        assert encoded_pair == [2] + text + [3] + text_2 + [3]
transformers/tests/utils.py
@@ -16,11 +16,24 @@ except KeyError:
    _run_slow_tests = False
else:
    # RUN_SLOW is set, convert it to True or False.
def parse_flag_from_env(key, default=False):
    try:
        value = os.environ[key]
    except KeyError:
        # KEY isn't set, default to `default`.
        _value = default
    else:
        # KEY is set, convert it to True or False.
        try:
            _run_slow_tests = strtobool(run_slow)
            _value = strtobool(value)
        except ValueError:
            # More values are supported, but let's keep the message simple.
            raise ValueError("If set, RUN_SLOW must be yes or no.")
            raise ValueError("If set, {} must be yes or no.".format(key))
    return _value

_run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False)
_run_custom_tokenizers = parse_flag_from_env("RUN_CUSTOM_TOKENIZERS", default=False)


def slow(test_case):

@@ -36,6 +49,19 @@ def slow(test_case):
    return test_case


def custom_tokenizers(test_case):
    """
    Decorator marking a test for a custom tokenizer.

    Custom tokenizers require additional dependencies, and are skipped
    by default. Set the RUN_CUSTOM_TOKENIZERS environment variable
    to a truthy value to run them.
    """
    if not _run_custom_tokenizers:
        test_case = unittest.skip("test of custom tokenizers")(test_case)
    return test_case


def require_torch(test_case):
    """
    Decorator marking a test that requires PyTorch.

@@ -62,6 +88,6 @@ def require_tf(test_case):
if _torch_available:
    # Set the USE_CUDA environment variable to select a GPU.
    torch_device = "cuda" if os.environ.get("USE_CUDA") else "cpu"
    torch_device = "cuda" if parse_flag_from_env("USE_CUDA") else "cpu"
else:
    torch_device = None
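A short sketch (not part of the diff) of how the new decorator is meant to be used; the test class name and the import path are illustrative only:

import unittest

from transformers.tests.utils import custom_tokenizers  # illustrative import path


@custom_tokenizers  # skipped unless RUN_CUSTOM_TOKENIZERS is set to a truthy value (yes/true/1)
class IllustrativeMecabTest(unittest.TestCase):
    def test_mecab_tagger_loads(self):
        import MeCab  # the extra dependency these tests guard
        self.assertIsNotNone(MeCab.Tagger())

In CI this is what the new build_py3_custom_tokenizers and build_py2_custom_tokenizers jobs do: they export RUN_CUSTOM_TOKENIZERS=1 before invoking pytest on the Japanese tokenizer test file.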
transformers/tokenization_auto.py
@@ -19,6 +19,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera

import logging

from .tokenization_bert import BertTokenizer
from .tokenization_bert_japanese import BertJapaneseTokenizer
from .tokenization_openai import OpenAIGPTTokenizer
from .tokenization_gpt2 import GPT2Tokenizer
from .tokenization_ctrl import CTRLTokenizer

@@ -72,6 +73,7 @@ class AutoTokenizer(object):
            - contains `albert`: AlbertTokenizer (ALBERT model)
            - contains `camembert`: CamembertTokenizer (CamemBERT model)
            - contains `roberta`: RobertaTokenizer (RoBERTa model)
            - contains `bert-base-japanese`: BertJapaneseTokenizer (Bert model)
            - contains `bert`: BertTokenizer (Bert model)
            - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
            - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)

@@ -118,6 +120,8 @@ class AutoTokenizer(object):
            return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'roberta' in pretrained_model_name_or_path:
            return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'bert-base-japanese' in pretrained_model_name_or_path:
            return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
            return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'openai-gpt' in pretrained_model_name_or_path:
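A minimal sketch (not part of the diff) of the routing added here. The 'bert-base-japanese' substring check sits above the generic 'bert' check, so Japanese checkpoints do not fall through to the plain BertTokenizer; the example assumes network access and an installed MeCab for the Japanese checkpoint:

from transformers import AutoTokenizer, BertJapaneseTokenizer, BertTokenizer

# Matches the new 'bert-base-japanese' branch before the generic 'bert' branch.
tok_ja = AutoTokenizer.from_pretrained("bert-base-japanese")
assert isinstance(tok_ja, BertJapaneseTokenizer)

# Ordinary BERT checkpoints still resolve to BertTokenizer.
tok_en = AutoTokenizer.from_pretrained("bert-base-uncased")
assert isinstance(tok_en, BertTokenizer)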
transformers/tokenization_bert_japanese.py
new file (mode 100644)

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import, division, print_function, unicode_literals

import collections
import logging
import os
import six
import unicodedata
from io import open

from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer, load_vocab
from .tokenization_utils import PreTrainedTokenizer

logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}

PRETRAINED_VOCAB_FILES_MAP = {
    'vocab_file': {
        'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-vocab.txt",
        'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-vocab.txt",
        'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-vocab.txt",
        'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-vocab.txt"
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'bert-base-japanese': 512,
    'bert-base-japanese-whole-word-masking': 512,
    'bert-base-japanese-char': 512,
    'bert-base-japanese-char-whole-word-masking': 512
}

PRETRAINED_INIT_CONFIGURATION = {
    'bert-base-japanese': {
        'do_lower_case': False,
        'word_tokenizer_type': 'mecab',
        'subword_tokenizer_type': 'wordpiece'
    },
    'bert-base-japanese-whole-word-masking': {
        'do_lower_case': False,
        'word_tokenizer_type': 'mecab',
        'subword_tokenizer_type': 'wordpiece'
    },
    'bert-base-japanese-char': {
        'do_lower_case': False,
        'word_tokenizer_type': 'mecab',
        'subword_tokenizer_type': 'character'
    },
    'bert-base-japanese-char-whole-word-masking': {
        'do_lower_case': False,
        'word_tokenizer_type': 'mecab',
        'subword_tokenizer_type': 'character'
    }
}


class BertJapaneseTokenizer(BertTokenizer):
    """BERT tokenizer for Japanese text"""

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(self, vocab_file, do_lower_case=False,
                 do_word_tokenize=True, do_subword_tokenize=True,
                 word_tokenizer_type='basic', subword_tokenizer_type='wordpiece',
                 never_split=None, unk_token='[UNK]', sep_token='[SEP]',
                 pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', **kwargs):
        """Constructs a MecabBertTokenizer.

        Args:
            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
            **do_lower_case**: (`optional`) boolean (default True)
                Whether to lower case the input.
                Only has an effect when do_basic_tokenize=True.
            **do_word_tokenize**: (`optional`) boolean (default True)
                Whether to do word tokenization.
            **do_subword_tokenize**: (`optional`) boolean (default True)
                Whether to do subword tokenization.
            **word_tokenizer_type**: (`optional`) string (default "basic")
                Type of word tokenizer.
            **subword_tokenizer_type**: (`optional`) string (default "wordpiece")
                Type of subword tokenizer.
        """
        super(BertTokenizer, self).__init__(
            unk_token=unk_token, sep_token=sep_token, pad_token=pad_token,
            cls_token=cls_token, mask_token=mask_token, **kwargs)
        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens

        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict(
            [(ids, tok) for tok, ids in self.vocab.items()])

        self.do_word_tokenize = do_word_tokenize
        if do_word_tokenize:
            if word_tokenizer_type == 'basic':
                self.word_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
                                                     never_split=never_split,
                                                     tokenize_chinese_chars=False)
            elif word_tokenizer_type == 'mecab':
                self.word_tokenizer = MecabTokenizer(do_lower_case=do_lower_case,
                                                     never_split=never_split)
            else:
                raise ValueError(
                    "Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type))

        self.do_subword_tokenize = do_subword_tokenize
        if do_subword_tokenize:
            if subword_tokenizer_type == 'wordpiece':
                self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab,
                                                            unk_token=self.unk_token)
            elif subword_tokenizer_type == 'character':
                self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab,
                                                            unk_token=self.unk_token)
            else:
                raise ValueError(
                    "Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type))

    def _tokenize(self, text):
        if self.do_word_tokenize:
            tokens = self.word_tokenizer.tokenize(text, never_split=self.all_special_tokens)
        else:
            tokens = [text]

        if self.do_subword_tokenize:
            split_tokens = [sub_token for token in tokens
                            for sub_token in self.subword_tokenizer.tokenize(token)]
        else:
            split_tokens = tokens

        return split_tokens


class MecabTokenizer(object):
    """Runs basic tokenization with MeCab morphological parser."""

    def __init__(self, do_lower_case=False, never_split=None, normalize_text=True):
        """Constructs a MecabTokenizer.

        Args:
            **do_lower_case**: (`optional`) boolean (default True)
                Whether to lower case the input.
            **never_split**: (`optional`) list of str
                Kept for backward compatibility purposes.
                Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
                List of token not to split.
            **normalize_text**: (`optional`) boolean (default True)
                Whether to apply unicode normalization to text before tokenization.
        """
        self.do_lower_case = do_lower_case
        self.never_split = never_split if never_split is not None else []
        self.normalize_text = normalize_text

        import MeCab
        self.mecab = MeCab.Tagger()

    def tokenize(self, text, never_split=None, **kwargs):
        """Tokenizes a piece of text."""
        if self.normalize_text:
            text = unicodedata.normalize('NFKC', text)

        never_split = self.never_split + (never_split if never_split is not None else [])
        tokens = []

        if six.PY2:
            mecab_output = self.mecab.parse(text.encode('utf-8')).decode('utf-8')
        else:
            mecab_output = self.mecab.parse(text)

        cursor = 0
        for line in mecab_output.split('\n'):
            if line == 'EOS':
                break

            token, _ = line.split('\t')
            token_start = text.index(token, cursor)
            token_end = token_start + len(token)
            if self.do_lower_case and token not in never_split:
                token = token.lower()

            tokens.append(token)
            cursor = token_end

        return tokens


class CharacterTokenizer(object):
    """Runs Character tokenziation."""

    def __init__(self, vocab, unk_token, normalize_text=True):
        """Constructs a CharacterTokenizer.

        Args:
            **vocab**:
                Vocabulary object.
            **unk_token**: str
                A special symbol for out-of-vocabulary token.
            **normalize_text**: (`optional`) boolean (default True)
                Whether to apply unicode normalization to text before tokenization.
        """
        self.vocab = vocab
        self.unk_token = unk_token
        self.normalize_text = normalize_text

    def tokenize(self, text):
        """Tokenizes a piece of text into characters.

        For example:
            input = "apple"
            output = ["a", "p", "p", "l", "e"]

        Args:
            text: A single token or whitespace separated tokens.
                This should have already been passed through `BasicTokenizer`.
        Returns:
            A list of characters.
        """
        if self.normalize_text:
            text = unicodedata.normalize('NFKC', text)

        output_tokens = []
        for i, char in enumerate(text):
            if char not in self.vocab:
                output_tokens.append(self.unk_token)
                continue

            output_tokens.append(char)

        return output_tokens
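To make the two-stage design above concrete, here is a small sketch (not part of the commit) comparing the wordpiece and character subword configurations; it assumes MeCab and its Python bindings are installed:

from transformers.tokenization_bert_japanese import BertJapaneseTokenizer

# 'bert-base-japanese' pins word_tokenizer_type='mecab' and subword_tokenizer_type='wordpiece'.
wp_tokenizer = BertJapaneseTokenizer.from_pretrained("bert-base-japanese")
print(wp_tokenizer.tokenize(u"こんにちは、世界。"))

# 'bert-base-japanese-char' keeps the MeCab word split but then emits single characters.
char_tokenizer = BertJapaneseTokenizer.from_pretrained("bert-base-japanese-char")
print(char_tokenizer.tokenize(u"こんにちは、世界。"))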
transformers/tokenization_utils.py
@@ -923,7 +923,7 @@ class PreTrainedTokenizer(object):
            return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                or PyTorch torch.Tensor instead of a list of python integers.
            return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True).
            return_attention_mask: (optional) Set to False to avoir returning attention mask (default True)
            return_attention_mask: (optional) Set to False to avoid returning attention mask (default True)
            return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False).
            return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False).

@@ -968,24 +968,13 @@ class PreTrainedTokenizer(object):
        if add_special_tokens:
            sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
            special_tokens_mask = self.get_special_tokens_mask(ids, pair_ids)
        else:
            sequence = ids + pair_ids if pair else ids
            token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
            special_tokens_mask = [0] * (len(ids) + (len(pair_ids) if pair else 0))

        if return_special_tokens_mask:
            encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)

        # Prepare inputs as tensors if asked
        if return_tensors == 'tf' and is_tf_available():
            sequence = tf.constant([sequence])
            token_type_ids = tf.constant([token_type_ids])
        elif return_tensors == 'pt' and is_torch_available():
            sequence = torch.tensor([sequence])
            token_type_ids = torch.tensor([token_type_ids])
        elif return_tensors is not None:
            logger.warning("Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(return_tensors))

        encoded_inputs["input_ids"] = sequence
        if return_token_type_ids:
            encoded_inputs["token_type_ids"] = token_type_ids

@@ -1022,7 +1011,6 @@ class PreTrainedTokenizer(object):
                if return_special_tokens_mask:
                    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
                encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference
            elif self.padding_side == 'left':
                if return_attention_mask:
                    encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"])

@@ -1038,6 +1026,25 @@ class PreTrainedTokenizer(object):
        elif return_attention_mask:
            encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"])

        # Prepare inputs as tensors if asked
        if return_tensors == 'tf' and is_tf_available():
            encoded_inputs["input_ids"] = tf.constant([encoded_inputs["input_ids"]])
            encoded_inputs["token_type_ids"] = tf.constant([encoded_inputs["token_type_ids"]])

            if "attention_mask" in encoded_inputs:
                encoded_inputs["attention_mask"] = tf.constant([encoded_inputs["attention_mask"]])

        elif return_tensors == 'pt' and is_torch_available():
            encoded_inputs["input_ids"] = torch.tensor([encoded_inputs["input_ids"]])
            encoded_inputs["token_type_ids"] = torch.tensor([encoded_inputs["token_type_ids"]])

            if "attention_mask" in encoded_inputs:
                encoded_inputs["attention_mask"] = torch.tensor([encoded_inputs["attention_mask"]])
        elif return_tensors is not None:
            logger.warning("Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(return_tensors))

        return encoded_inputs

    def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0,
                           truncation_strategy='longest_first', stride=0):
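For context on the hunks above, a brief sketch (not part of the diff) of the return_tensors path they touch; 'bert-base-uncased' is used purely as an example checkpoint, and the snippet assumes PyTorch is installed:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# With return_tensors='pt', input_ids / token_type_ids / attention_mask come back
# as torch tensors of shape (1, sequence_length) instead of plain Python lists.
encoded = tokenizer.encode_plus("Hello world", add_special_tokens=True, return_tensors='pt')
print(encoded["input_ids"].shape)
print(encoded["attention_mask"])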