chenpangpang / transformers · Commits

Commit c03c0dfd
Add support for Japanese BERT models by cl-tohoku
Authored Nov 15, 2019 by Masatoshi Suzuki; committed by Julien Chaumond, Dec 11, 2019
Parent: 030faccb
Showing 7 changed files with 289 additions and 8 deletions (+289 -8)
docs/source/pretrained_models.rst           +18 -0
transformers/__init__.py                    +1 -0
transformers/configuration_bert.py          +4 -0
transformers/modeling_bert.py               +6 -2
transformers/modeling_tf_bert.py            +10 -6
transformers/tokenization_auto.py           +3 -0
transformers/tokenization_bert_japanese.py  +247 -0
docs/source/pretrained_models.rst
...
@@ -61,6 +61,24 @@ Here is the full list of the currently provided pretrained models together with
| | ``bert-base-german-dbmdz-uncased`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on uncased German text by DBMDZ |
| | | (see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-japanese`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece. |
| | | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization. |
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-japanese-whole-word-masking`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece. |
| | | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization. |
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-japanese-char`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on Japanese text. Text is tokenized into characters. |
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``bert-base-japanese-char-whole-word-masking`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters. |
| | | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__). |
+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| GPT | ``openai-gpt`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. |
| | | | OpenAI GPT English model |
...
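As a quick check of the new table entries (not part of this diff): the checkpoints documented above can be loaded by their shortcut names. A minimal sketch, assuming MeCab and a Python binding such as mecab-python3 are installed:

from transformers import BertJapaneseTokenizer

# Downloads the vocabulary registered for 'bert-base-japanese' and runs
# MeCab word segmentation followed by WordPiece, as described in the table.
tokenizer = BertJapaneseTokenizer.from_pretrained('bert-base-japanese')
print(tokenizer.tokenize("吾輩は猫である。"))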
transformers/__init__.py
...
@@ -37,6 +37,7 @@ if is_sklearn_available():
from .tokenization_utils import (PreTrainedTokenizer)
from .tokenization_auto import AutoTokenizer
from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
from .tokenization_bert_japanese import BertJapaneseTokenizer, MecabTokenizer, CharacterTokenizer
from .tokenization_openai import OpenAIGPTTokenizer
from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
from .tokenization_gpt2 import GPT2Tokenizer
...
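This one-line export makes the new tokenizer classes importable from the package root. A minimal sketch of what becomes possible (not part of the diff; the toy vocabulary below is hypothetical):

from transformers import BertJapaneseTokenizer, MecabTokenizer, CharacterTokenizer

# CharacterTokenizer needs no external dependency; it simply maps each character
# to itself or to the unknown token if it is not in the vocabulary.
char_tok = CharacterTokenizer(vocab={'吾': 1, '輩': 2}, unk_token='[UNK]')
print(char_tok.tokenize('吾輩'))  # ['吾', '輩']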
transformers/configuration_bert.py
...
@@ -42,6 +42,10 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
    'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json",
    'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json",
    'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-config.json",
    'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-config.json",
    'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-config.json",
    'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-config.json"
}
...
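With the four config URLs registered, the standard shortcut-name lookup applies to the Japanese models as well. A minimal sketch (not part of the diff):

from transformers import BertConfig

# Resolves 'bert-base-japanese' through BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
# and downloads bert-base-japanese-config.json.
config = BertConfig.from_pretrained('bert-base-japanese')
print(config.num_hidden_layers, config.hidden_size)  # 12 layers, 768 hidden per the docs table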
transformers/modeling_bert.py
...
@@ -48,6 +48,10 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
    'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
    'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin",
    'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin",
    'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-pytorch_model.bin",
    'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-pytorch_model.bin",
    'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-pytorch_model.bin",
    'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-pytorch_model.bin"
}
...
@@ -1233,9 +1237,9 @@ class BertForQuestionAnswering(BertPreTrainedModel):
        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
        input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]"
        input_ids = tokenizer.encode(input_text)
        token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]
        start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
        print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))
        # a nice puppet
...
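The PyTorch weights can then be pulled by the same shortcut names. A minimal sketch (not part of the diff), assuming MeCab is installed for the tokenizer:

import torch
from transformers import BertModel, BertJapaneseTokenizer

tokenizer = BertJapaneseTokenizer.from_pretrained('bert-base-japanese')
model = BertModel.from_pretrained('bert-base-japanese')

# Encode a sentence with [CLS]/[SEP] and take the final hidden states.
input_ids = torch.tensor([tokenizer.encode("吾輩は猫である。", add_special_tokens=True)])
last_hidden_state = model(input_ids)[0]  # shape (batch_size, seq_len, 768)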
transformers/modeling_tf_bert.py
...
@@ -48,6 +48,10 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
    'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5",
    'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5",
    'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5",
    'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-tf_model.h5",
    'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-tf_model.h5",
    'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-tf_model.h5",
    'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-tf_model.h5"
}
...
@@ -129,7 +133,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
                linear tensor, float32 with shape [batch_size, length, vocab_size].
            Raises:
                ValueError: if mode is not valid.
            Shared weights logic adapted from
                https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
        """
...
@@ -148,7 +152,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
            input_shape = shape_list(input_ids)
        else:
            input_shape = shape_list(inputs_embeds)[:-1]

        seq_length = input_shape[1]
        if position_ids is None:
            position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
...
@@ -246,7 +250,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
        context_layer = tf.matmul(attention_probs, value_layer)

        context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
        context_layer = tf.reshape(context_layer,
                                   (batch_size, -1, self.all_head_size))  # (batch_size, seq_len_q, all_head_size)

        outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
...
@@ -591,7 +595,7 @@ BERT_START_DOCSTRING = r""" The BERT model was proposed in
        `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`

    Parameters:
        config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""
...
@@ -605,13 +609,13 @@ BERT_INPUTS_DOCSTRING = r"""
        (a) For sequence pairs:

            ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
            ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1``

        (b) For single sequences:

            ``tokens: [CLS] the dog is hairy . [SEP]``
            ``token_type_ids: 0 0 0 0 0 0 0``

    Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on
...
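The TensorFlow checkpoints mirror the PyTorch ones, so the same shortcut names work on the TF side. A minimal sketch (not part of the diff), again assuming MeCab is installed:

import tensorflow as tf
from transformers import TFBertModel, BertJapaneseTokenizer

tokenizer = BertJapaneseTokenizer.from_pretrained('bert-base-japanese')
model = TFBertModel.from_pretrained('bert-base-japanese')

# Same encoding as in the PyTorch sketch, fed as a TF tensor.
input_ids = tf.constant([tokenizer.encode("吾輩は猫である。", add_special_tokens=True)])
last_hidden_state = model(input_ids)[0]  # shape (batch_size, seq_len, 768)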
transformers/tokenization_auto.py
...
@@ -19,6 +19,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import logging

from .tokenization_bert import BertTokenizer
from .tokenization_bert_japanese import BertJapaneseTokenizer
from .tokenization_openai import OpenAIGPTTokenizer
from .tokenization_gpt2 import GPT2Tokenizer
from .tokenization_ctrl import CTRLTokenizer
...
@@ -118,6 +119,8 @@ class AutoTokenizer(object):
            return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'roberta' in pretrained_model_name_or_path:
            return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'bert-japanese' in pretrained_model_name_or_path:
            return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
            return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'openai-gpt' in pretrained_model_name_or_path:
...
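Note that the new branch is a plain substring test on the checkpoint name or path, checked before the generic 'bert' branch. A short sketch of the resulting behaviour (not part of the diff; the local directory name is hypothetical):

# A path containing the literal substring 'bert-japanese' takes the new branch:
'bert-japanese' in './my-bert-japanese-model'    # True  -> BertJapaneseTokenizer
# The published shortcut names do not contain that exact substring, so they
# still fall through to the plain 'bert' branch:
'bert-japanese' in 'bert-base-japanese'          # False -> BertTokenizer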
transformers/tokenization_bert_japanese.py (new file, 0 → 100644)
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import, division, print_function, unicode_literals

import collections
import logging
import os
import unicodedata
from io import open

from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer, load_vocab
from .tokenization_utils import PreTrainedTokenizer

logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}

PRETRAINED_VOCAB_FILES_MAP = {
    'vocab_file':
    {
        'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-vocab.txt",
        'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-vocab.txt",
        'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-vocab.txt",
        'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-vocab.txt"
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'bert-base-japanese': 512,
    'bert-base-japanese-whole-word-masking': 512,
    'bert-base-japanese-char': 512,
    'bert-base-japanese-char-whole-word-masking': 512
}

PRETRAINED_INIT_CONFIGURATION = {
    'bert-base-japanese': {
        'do_lower_case': False,
        'word_tokenizer_type': 'mecab',
        'subword_tokenizer_type': 'wordpiece'
    },
    'bert-base-japanese-whole-word-masking': {
        'do_lower_case': False,
        'word_tokenizer_type': 'mecab',
        'subword_tokenizer_type': 'wordpiece'
    },
    'bert-base-japanese-char': {
        'do_lower_case': False,
        'word_tokenizer_type': 'mecab',
        'subword_tokenizer_type': 'character'
    },
    'bert-base-japanese-char-whole-word-masking': {
        'do_lower_case': False,
        'word_tokenizer_type': 'mecab',
        'subword_tokenizer_type': 'character'
    }
}
class BertJapaneseTokenizer(BertTokenizer):
    """BERT tokenizer for Japanese text"""

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(self, vocab_file, do_lower_case=False,
                 do_word_tokenize=True, do_subword_tokenize=True,
                 word_tokenizer_type='basic', subword_tokenizer_type='wordpiece',
                 never_split=None, unk_token='[UNK]', sep_token='[SEP]',
                 pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]',
                 **kwargs):
        """Constructs a BertJapaneseTokenizer.

        Args:
            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
            **do_lower_case**: (`optional`) boolean (default False)
                Whether to lower case the input.
                Only has an effect when do_word_tokenize=True.
            **do_word_tokenize**: (`optional`) boolean (default True)
                Whether to do word tokenization.
            **do_subword_tokenize**: (`optional`) boolean (default True)
                Whether to do subword tokenization.
            **word_tokenizer_type**: (`optional`) string (default "basic")
                Type of word tokenizer.
            **subword_tokenizer_type**: (`optional`) string (default "wordpiece")
                Type of subword tokenizer.
        """
        super(BertTokenizer, self).__init__(
            unk_token=unk_token, sep_token=sep_token, pad_token=pad_token,
            cls_token=cls_token, mask_token=mask_token, **kwargs)
        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens

        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))

        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict(
            [(ids, tok) for tok, ids in self.vocab.items()])

        self.do_word_tokenize = do_word_tokenize
        if do_word_tokenize:
            if word_tokenizer_type == 'basic':
                self.word_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
                                                     never_split=never_split,
                                                     tokenize_chinese_chars=False)
            elif word_tokenizer_type == 'mecab':
                self.word_tokenizer = MecabTokenizer(do_lower_case=do_lower_case,
                                                     never_split=never_split)
            else:
                raise ValueError(
                    "Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type))

        self.do_subword_tokenize = do_subword_tokenize
        if do_subword_tokenize:
            if subword_tokenizer_type == 'wordpiece':
                self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab,
                                                            unk_token=self.unk_token)
            elif subword_tokenizer_type == 'character':
                self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab,
                                                            unk_token=self.unk_token)
            else:
                raise ValueError(
                    "Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type))

    def _tokenize(self, text):
        if self.do_word_tokenize:
            tokens = self.word_tokenizer.tokenize(text,
                                                  never_split=self.all_special_tokens)
        else:
            tokens = [text]

        if self.do_subword_tokenize:
            split_tokens = [sub_token for token in tokens
                            for sub_token in self.subword_tokenizer.tokenize(token)]
        else:
            split_tokens = tokens

        return split_tokens
class MecabTokenizer(object):
    """Runs basic tokenization with MeCab morphological parser."""

    def __init__(self, do_lower_case=False, never_split=None, normalize_text=True):
        """Constructs a MecabTokenizer.

        Args:
            **do_lower_case**: (`optional`) boolean (default False)
                Whether to lower case the input.
            **never_split**: (`optional`) list of str
                Kept for backward compatibility purposes.
                Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
                List of tokens not to split.
            **normalize_text**: (`optional`) boolean (default True)
                Whether to apply unicode normalization to text before tokenization.
        """
        self.do_lower_case = do_lower_case
        self.never_split = never_split if never_split is not None else []
        self.normalize_text = normalize_text

        import MeCab
        self.mecab = MeCab.Tagger()

    def tokenize(self, text, never_split=None, **kwargs):
        """Tokenizes a piece of text."""
        if self.normalize_text:
            text = unicodedata.normalize('NFKC', text)

        never_split = self.never_split + (never_split if never_split is not None else [])
        tokens = []

        cursor = 0
        for line in self.mecab.parse(text).split('\n'):
            if line == 'EOS':
                break

            token, _ = line.split('\t')
            token_start = text.index(token, cursor)
            token_end = token_start + len(token)
            if self.do_lower_case and token not in never_split:
                token = token.lower()

            tokens.append(token)
            cursor = token_end

        return tokens
class CharacterTokenizer(object):
    """Runs Character tokenization."""

    def __init__(self, vocab, unk_token, normalize_text=True):
        """Constructs a CharacterTokenizer.

        Args:
            **vocab**:
                Vocabulary object.
            **unk_token**: str
                A special symbol for out-of-vocabulary token.
            **normalize_text**: (`optional`) boolean (default True)
                Whether to apply unicode normalization to text before tokenization.
        """
        self.vocab = vocab
        self.unk_token = unk_token
        self.normalize_text = normalize_text

    def tokenize(self, text):
        """Tokenizes a piece of text into characters.

        For example:
            input = "apple"
            output = ["a", "p", "p", "l", "e"]

        Args:
            text: A single token or whitespace separated tokens.
                This should have already been passed through `BasicTokenizer`.

        Returns:
            A list of characters.
        """
        if self.normalize_text:
            text = unicodedata.normalize('NFKC', text)

        output_tokens = []
        for i, char in enumerate(text):
            if char not in self.vocab:
                output_tokens.append(self.unk_token)
                continue

            output_tokens.append(char)

        return output_tokens
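A minimal sketch of how the two stages of the new tokenizer compose (not part of the diff), assuming MeCab plus a Python binding such as mecab-python3 are installed, and using a hypothetical local vocabulary file:

from transformers import BertJapaneseTokenizer

# word_tokenizer_type='mecab' runs MecabTokenizer first, then WordpieceTokenizer
# on each word; subword_tokenizer_type='character' would use CharacterTokenizer instead.
tokenizer = BertJapaneseTokenizer(vocab_file='vocab.txt',          # hypothetical path
                                  word_tokenizer_type='mecab',
                                  subword_tokenizer_type='wordpiece')
print(tokenizer.tokenize("吾輩は猫である。"))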