chenpangpang / transformers · Commit 386a93f0 (unverified)

Merge branch 'master' into from-pretrained-from-url

Authored by Thomas Wolf on Dec 12, 2019; committed via GitHub on Dec 12, 2019.
Parents: 4f15e5a2, 2d103546
Changes: 11 changed files with 572 additions and 37 deletions (+572 / −37).
.circleci/config.yml                                     +23   −0
docs/source/pretrained_models.rst                        +26   −8
transformers/__init__.py                                 +1    −0
transformers/configuration_bert.py                       +4    −0
transformers/modeling_bert.py                            +6    −2
transformers/modeling_tf_bert.py                         +10   −6
transformers/tests/tokenization_bert_japanese_test.py    +191  −0
transformers/tests/utils.py                              +31   −5
transformers/tokenization_auto.py                        +4    −0
transformers/tokenization_bert_japanese.py               +253  −0
transformers/tokenization_utils.py                       +23   −16
.circleci/config.yml

@@ -70,6 +70,27 @@ jobs:
             - run: sudo pip install pytest codecov pytest-cov
             - run: python -m pytest -sv ./transformers/tests/ --cov
             - run: codecov
+    build_py3_custom_tokenizers:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.5
+        steps:
+            - checkout
+            - run: sudo pip install --progress-bar off .
+            - run: sudo pip install pytest
+            - run: sudo pip install mecab-python3
+            - run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py
+    build_py2_custom_tokenizers:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:2.7
+        steps:
+            - checkout
+            - run: sudo pip install --progress-bar off .
+            - run: sudo pip install pytest
+            - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig
+            - run: sudo pip install mecab-python
+            - run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py
    deploy_doc:
        working_directory: ~/transformers
        docker:

@@ -91,6 +112,8 @@ workflows:
    version: 2
    build_and_test:
        jobs:
+            - build_py3_custom_tokenizers
+            - build_py2_custom_tokenizers
            - build_py3_torch_and_tf
            - build_py3_torch
            - build_py3_tf
docs/source/pretrained_models.rst

@@ -61,6 +61,24 @@ Here is the full list of the currently provided pretrained models together with a short presentation of each model.

Four new BERT entries are added to the table right after ``bert-base-german-dbmdz-uncased`` (12-layer, 768-hidden, 12-heads, 110M parameters; trained on uncased German text by DBMDZ, see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__):

| ``bert-base-japanese``                         | 12-layer, 768-hidden, 12-heads, 110M parameters.                                               |
|                                                | Trained on Japanese text. Text is tokenized with MeCab and WordPiece.                          |
|                                                | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization.                     |
|                                                | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).        |
| ``bert-base-japanese-whole-word-masking``      | 12-layer, 768-hidden, 12-heads, 110M parameters.                                               |
|                                                | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece. |
|                                                | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization.                     |
|                                                | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).        |
| ``bert-base-japanese-char``                    | 12-layer, 768-hidden, 12-heads, 110M parameters.                                               |
|                                                | Trained on Japanese text. Text is tokenized into characters.                                   |
|                                                | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).        |
| ``bert-base-japanese-char-whole-word-masking`` | 12-layer, 768-hidden, 12-heads, 110M parameters.                                               |
|                                                | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters.          |
|                                                | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).        |

The next row of the table, ``openai-gpt`` (GPT; 12-layer, 768-hidden, 12-heads, 110M parameters; OpenAI GPT English model), is unchanged.

@@ -169,35 +187,35 @@ (whitespace-only realignment of the trailing ``|`` in the ALBERT rows; their contents are unchanged):

| ALBERT | ``albert-base-v1``    | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters; ALBERT base model (see `details <https://github.com/google-research/ALBERT>`__)                                          |
|        | ``albert-large-v1``   | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters; ALBERT large model (see `details <https://github.com/google-research/ALBERT>`__)                                        |
|        | ``albert-xlarge-v1``  | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters; ALBERT xlarge model (see `details <https://github.com/google-research/ALBERT>`__)                                       |
|        | ``albert-xxlarge-v1`` | 12 repeating layers, 128 embedding, 4096-hidden, 64-heads, 223M parameters; ALBERT xxlarge model (see `details <https://github.com/google-research/ALBERT>`__)                                     |
|        | ``albert-base-v2``    | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters; ALBERT base model with no dropout, additional training data and longer training (see `details <https://github.com/google-research/ALBERT>`__)     |
|        | ``albert-large-v2``   | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters; ALBERT large model with no dropout, additional training data and longer training (see `details <https://github.com/google-research/ALBERT>`__)   |
|        | ``albert-xlarge-v2``  | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters; ALBERT xlarge model with no dropout, additional training data and longer training (see `details <https://github.com/google-research/ALBERT>`__)  |
|        | ``albert-xxlarge-v2`` | 12 repeating layers, 128 embedding, 4096-hidden, 64-heads, 223M parameters; ALBERT xxlarge model with no dropout, additional training data and longer training (see `details <https://github.com/google-research/ALBERT>`__)|
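As a quick illustration of the new table entries (not part of the commit): the sketch below assumes the MeCab bindings are installed (for example mecab-python3, as in the CI jobs above) and that the cl-tohoku checkpoints are reachable.

from transformers import BertJapaneseTokenizer

# 'bert-base-japanese' segments with MeCab, then applies WordPiece subwords.
wp_tokenizer = BertJapaneseTokenizer.from_pretrained("bert-base-japanese")
print(wp_tokenizer.tokenize("こんばんは、世界。"))
# e.g. ['こんばんは', '、', '世界', '。'] (MeCab words, split further only when needed)

# 'bert-base-japanese-char' segments with MeCab, then splits every word into characters.
char_tokenizer = BertJapaneseTokenizer.from_pretrained("bert-base-japanese-char")
print(char_tokenizer.tokenize("こんばんは、世界。"))
# e.g. ['こ', 'ん', 'ば', 'ん', 'は', '、', '世', '界', '。']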
transformers/__init__.py

@@ -37,6 +37,7 @@ if is_sklearn_available():

 from .tokenization_utils import (PreTrainedTokenizer)
 from .tokenization_auto import AutoTokenizer
 from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
+from .tokenization_bert_japanese import BertJapaneseTokenizer, MecabTokenizer, CharacterTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
 from .tokenization_gpt2 import GPT2Tokenizer
transformers/configuration_bert.py

@@ -42,6 +42,10 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {

     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
     'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json",
     'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json",
+    'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json",
+    'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json",
+    'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json",
+    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json"
 }
transformers/modeling_bert.py

@@ -48,6 +48,10 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {

     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
     'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin",
     'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin",
+    'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin",
+    'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin",
+    'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin",
+    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin"
 }

@@ -1233,9 +1237,9 @@ class BertForQuestionAnswering(BertPreTrainedModel):
(whitespace-only change to the docstring usage example; the example reads:)

     question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
     input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]"
     input_ids = tokenizer.encode(input_text)
     token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]
     start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
     all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
     print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))
     # a nice puppet
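A hedged sketch (not from the commit) of what the new archive-map keys enable on the PyTorch side; it assumes MeCab is installed and the cl-tohoku weights can be downloaded.

import torch
from transformers import BertJapaneseTokenizer, BertModel

tokenizer = BertJapaneseTokenizer.from_pretrained("bert-base-japanese")
model = BertModel.from_pretrained("bert-base-japanese")
model.eval()

input_ids = torch.tensor([tokenizer.encode("こんにちは、世界。", add_special_tokens=True)])
with torch.no_grad():
    last_hidden_state = model(input_ids)[0]
print(last_hidden_state.shape)  # (1, sequence_length, 768) for this 12-layer base model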
transformers/modeling_tf_bert.py

@@ -48,6 +48,10 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {

     'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5",
     'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5",
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5",
+    'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-tf_model.h5",
+    'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5",
+    'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5",
+    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5"
 }

The remaining hunks in this file are whitespace-only changes (the visible text is identical before and after); the surrounding code, shown once:

@@ -129,7 +133,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
             linear tensor, float32 with shape [batch_size, length, vocab_size].
         Raises:
             ValueError: if mode is not valid.

         Shared weights logic adapted from
             https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
         """

@@ -148,7 +152,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
             input_shape = shape_list(input_ids)
         else:
             input_shape = shape_list(inputs_embeds)[:-1]

         seq_length = input_shape[1]
         if position_ids is None:
             position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]

@@ -246,7 +250,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
         context_layer = tf.matmul(attention_probs, value_layer)
         context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
         context_layer = tf.reshape(context_layer,
                                    (batch_size, -1, self.all_head_size))  # (batch_size, seq_len_q, all_head_size)
         outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)

@@ -591,7 +595,7 @@ BERT_START_DOCSTRING = r""" The BERT model was proposed in
     `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`

     Parameters:
         config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the configuration.
             Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """

@@ -605,13 +609,13 @@ BERT_INPUTS_DOCSTRING = r"""
     (a) For sequence pairs:
         ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
         ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1``
     (b) For single sequences:
         ``tokens:         [CLS] the dog is hairy . [SEP]``
         ``token_type_ids: 0 0 0 0 0 0 0``

     Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on
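The TF archive map mirrors the PyTorch one, so the same shortcut names work with the TensorFlow classes. A minimal sketch (not from the commit; assumes TensorFlow 2 and MeCab are installed):

import tensorflow as tf
from transformers import BertJapaneseTokenizer, TFBertModel

tokenizer = BertJapaneseTokenizer.from_pretrained("bert-base-japanese")
model = TFBertModel.from_pretrained("bert-base-japanese")

input_ids = tf.constant([tokenizer.encode("こんにちは、世界。", add_special_tokens=True)])
last_hidden_state = model(input_ids)[0]
print(last_hidden_state.shape)  # (1, sequence_length, 768)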
transformers/tests/tokenization_bert_japanese_test.py  (new file, 0 → 100644)

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import, division, print_function, unicode_literals

import os
import unittest
from io import open

from transformers.tokenization_bert import WordpieceTokenizer
from transformers.tokenization_bert_japanese import (BertJapaneseTokenizer,
                                                     MecabTokenizer, CharacterTokenizer,
                                                     VOCAB_FILES_NAMES)

from .tokenization_tests_commons import CommonTestCases
from .utils import slow, custom_tokenizers


@custom_tokenizers
class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = BertJapaneseTokenizer

    def setUp(self):
        super(BertJapaneseTokenizationTest, self).setUp()

        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
                        u"こんにちは", u"こん", u"にちは", u"ばんは",
                        u"##こん", u"##にちは", u"##ばんは",
                        u"世界", u"##世界", u"、", u"##、", u"。", u"##。"]

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

    def get_tokenizer(self, **kwargs):
        return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, **kwargs)

    def get_input_output_texts(self):
        input_text = u"こんにちは、世界。 \nこんばんは、世界。"
        output_text = u"こんにちは 、 世界 。 こんばんは 、 世界 。"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = self.tokenizer_class(self.vocab_file)

        tokens = tokenizer.tokenize(u"こんにちは、世界。 \nこんばんは、世界。")
        self.assertListEqual(tokens,
                             [u"こんにちは", u"、", u"世界", u"。",
                              u"こん", u"##ばんは", u"、", u"世界", "。"])
        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
                             [3, 12, 10, 14, 4, 9, 12, 10, 14])

    def test_mecab_tokenizer(self):
        tokenizer = MecabTokenizer()

        self.assertListEqual(
            tokenizer.tokenize(u" \tアップルストアでiPhone8 が \n発売された 。 "),
            [u"アップルストア", u"で", u"iPhone", u"8", u"が",
             u"発売", u"さ", u"れ", u"た", u"。"])

    def test_mecab_tokenizer_lower(self):
        tokenizer = MecabTokenizer(do_lower_case=True)

        self.assertListEqual(
            tokenizer.tokenize(u" \tアップルストアでiPhone8 が \n発売された 。 "),
            [u"アップルストア", u"で", u"iphone", u"8", u"が",
             u"発売", u"さ", u"れ", u"た", u"。"])

    def test_mecab_tokenizer_no_normalize(self):
        tokenizer = MecabTokenizer(normalize_text=False)

        self.assertListEqual(
            tokenizer.tokenize(u" \tアップルストアでiPhone8 が \n発売された 。 "),
            [u"アップルストア", u"で", u"iPhone", u"8", u"が",
             u"発売", u"さ", u"れ", u"た", u" ", u"。"])

    def test_wordpiece_tokenizer(self):
        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
                        u"こんにちは", u"こん", u"にちは" u"ばんは",
                        u"##こん", u"##にちは", u"##ばんは"]

        vocab = {}
        for (i, token) in enumerate(vocab_tokens):
            vocab[token] = i
        tokenizer = WordpieceTokenizer(vocab=vocab, unk_token=u"[UNK]")

        self.assertListEqual(tokenizer.tokenize(u""), [])

        self.assertListEqual(tokenizer.tokenize(u"こんにちは"),
                             [u"こんにちは"])

        self.assertListEqual(tokenizer.tokenize(u"こんばんは"),
                             [u"こん", u"##ばんは"])

        self.assertListEqual(tokenizer.tokenize(u"こんばんは こんばんにちは こんにちは"),
                             [u"こん", u"##ばんは", u"[UNK]", u"こんにちは"])

    @slow
    def test_sequence_builders(self):
        tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese")

        text = tokenizer.encode(u"ありがとう。", add_special_tokens=False)
        text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False)

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        # 2 is for "[CLS]", 3 is for "[SEP]"
        assert encoded_sentence == [2] + text + [3]
        assert encoded_pair == [2] + text + [3] + text_2 + [3]


class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = BertJapaneseTokenizer

    def setUp(self):
        super(BertJapaneseCharacterTokenizationTest, self).setUp()

        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
                        u"こ", u"ん", u"に", u"ち", u"は", u"ば",
                        u"世", u"界", u"、", u"。"]

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

    def get_tokenizer(self, **kwargs):
        return BertJapaneseTokenizer.from_pretrained(self.tmpdirname,
                                                     subword_tokenizer_type="character",
                                                     **kwargs)

    def get_input_output_texts(self):
        input_text = u"こんにちは、世界。 \nこんばんは、世界。"
        output_text = u"こ ん に ち は 、 世 界 。 こ ん ば ん は 、 世 界 。"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = self.tokenizer_class(self.vocab_file,
                                         subword_tokenizer_type="character")

        tokens = tokenizer.tokenize(u"こんにちは、世界。 \nこんばんは、世界。")
        self.assertListEqual(tokens,
                             [u"こ", u"ん", u"に", u"ち", u"は", u"、", u"世", u"界", u"。",
                              u"こ", u"ん", u"ば", u"ん", u"は", u"、", u"世", u"界", u"。"])
        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
                             [3, 4, 5, 6, 7, 11, 9, 10, 12, 3, 4, 8, 4, 7, 11, 9, 10, 12])

    def test_character_tokenizer(self):
        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
                        u"こ", u"ん", u"に", u"ち", u"は", u"ば",
                        u"世", u"界" u"、", u"。"]

        vocab = {}
        for (i, token) in enumerate(vocab_tokens):
            vocab[token] = i
        tokenizer = CharacterTokenizer(vocab=vocab, unk_token=u"[UNK]")

        self.assertListEqual(tokenizer.tokenize(u""), [])

        self.assertListEqual(tokenizer.tokenize(u"こんにちは"),
                             [u"こ", u"ん", u"に", u"ち", u"は"])

        self.assertListEqual(tokenizer.tokenize(u"こんにちほ"),
                             [u"こ", u"ん", u"に", u"ち", u"[UNK]"])

    @slow
    def test_sequence_builders(self):
        tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese-char")

        text = tokenizer.encode(u"ありがとう。", add_special_tokens=False)
        text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False)

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        # 2 is for "[CLS]", 3 is for "[SEP]"
        assert encoded_sentence == [2] + text + [3]
        assert encoded_pair == [2] + text + [3] + text_2 + [3]
transformers/tests/utils.py

@@ -16,11 +16,24 @@ except KeyError:
The inline RUN_SLOW handling (a bare `_run_slow_tests = strtobool(run_slow)` guarded by `except ValueError: raise ValueError("If set, RUN_SLOW must be yes or no.")`) is replaced by a reusable helper, and a RUN_CUSTOM_TOKENIZERS flag is added alongside RUN_SLOW:

def parse_flag_from_env(key, default=False):
    try:
        value = os.environ[key]
    except KeyError:
        # KEY isn't set, default to `default`.
        _value = default
    else:
        # KEY is set, convert it to True or False.
        try:
            _value = strtobool(value)
        except ValueError:
            # More values are supported, but let's keep the message simple.
            raise ValueError("If set, {} must be yes or no.".format(key))
    return _value

_run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False)

_run_custom_tokenizers = parse_flag_from_env("RUN_CUSTOM_TOKENIZERS", default=False)

@@ -36,6 +49,19 @@ def slow(test_case):
     return test_case

+def custom_tokenizers(test_case):
+    """
+    Decorator marking a test for a custom tokenizer.
+
+    Custom tokenizers require additional dependencies, and are skipped
+    by default. Set the RUN_CUSTOM_TOKENIZERS environment variable
+    to a truthy value to run them.
+    """
+    if not _run_custom_tokenizers:
+        test_case = unittest.skip("test of custom tokenizers")(test_case)
+    return test_case
+
 def require_torch(test_case):
     """
     Decorator marking a test that requires PyTorch.

@@ -62,6 +88,6 @@ def require_tf(test_case):
 if _torch_available:
     # Set the USE_CUDA environment variable to select a GPU.
-    torch_device = "cuda" if os.environ.get("USE_CUDA") else "cpu"
+    torch_device = "cuda" if parse_flag_from_env("USE_CUDA") else "cpu"
 else:
     torch_device = None
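A toy sketch (not part of the commit) of how the new gate behaves; it assumes the test helpers are importable as `transformers.tests.utils`, which is how the test suite itself imports them:

import unittest

from transformers.tests.utils import custom_tokenizers


@custom_tokenizers
class ToyJapaneseTokenizerTest(unittest.TestCase):
    # Skipped unless RUN_CUSTOM_TOKENIZERS is set to a truthy value
    # (strtobool accepts y/yes/t/true/on/1), matching the CircleCI jobs
    # above that export RUN_CUSTOM_TOKENIZERS=1 before invoking pytest.
    def test_placeholder(self):
        self.assertTrue(True)


if __name__ == "__main__":
    unittest.main()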
transformers/tokenization_auto.py

@@ -19,6 +19,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals

 import logging

 from .tokenization_bert import BertTokenizer
+from .tokenization_bert_japanese import BertJapaneseTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_gpt2 import GPT2Tokenizer
 from .tokenization_ctrl import CTRLTokenizer

@@ -72,6 +73,7 @@ class AutoTokenizer(object):

         - contains `albert`: AlbertTokenizer (ALBERT model)
         - contains `camembert`: CamembertTokenizer (CamemBERT model)
         - contains `roberta`: RobertaTokenizer (RoBERTa model)
+        - contains `bert-base-japanese`: BertJapaneseTokenizer (Bert model)
         - contains `bert`: BertTokenizer (Bert model)
         - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
         - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)

@@ -118,6 +120,8 @@ class AutoTokenizer(object):

             return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'bert-base-japanese' in pretrained_model_name_or_path:
+            return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
             return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'openai-gpt' in pretrained_model_name_or_path:
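A short sketch (not in the commit) of the dispatch order: because the new branch is checked before the generic 'bert' branch, any name containing 'bert-base-japanese' resolves to the Japanese tokenizer. It assumes MeCab is installed and the vocab files can be downloaded.

from transformers import AutoTokenizer, BertJapaneseTokenizer, BertTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-japanese-whole-word-masking")
assert isinstance(tokenizer, BertJapaneseTokenizer)

# A plain BERT name still takes the generic branch.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
assert isinstance(tokenizer, BertTokenizer)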
transformers/tokenization_bert_japanese.py  (new file, 0 → 100644)

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""

from __future__ import absolute_import, division, print_function, unicode_literals

import collections
import logging
import os
import six
import unicodedata
from io import open

from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer, load_vocab
from .tokenization_utils import PreTrainedTokenizer

logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}

PRETRAINED_VOCAB_FILES_MAP = {
    'vocab_file':
    {
        'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-vocab.txt",
        'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-vocab.txt",
        'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-vocab.txt",
        'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-vocab.txt"
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'bert-base-japanese': 512,
    'bert-base-japanese-whole-word-masking': 512,
    'bert-base-japanese-char': 512,
    'bert-base-japanese-char-whole-word-masking': 512
}

PRETRAINED_INIT_CONFIGURATION = {
    'bert-base-japanese': {
        'do_lower_case': False,
        'word_tokenizer_type': 'mecab',
        'subword_tokenizer_type': 'wordpiece'
    },
    'bert-base-japanese-whole-word-masking': {
        'do_lower_case': False,
        'word_tokenizer_type': 'mecab',
        'subword_tokenizer_type': 'wordpiece'
    },
    'bert-base-japanese-char': {
        'do_lower_case': False,
        'word_tokenizer_type': 'mecab',
        'subword_tokenizer_type': 'character'
    },
    'bert-base-japanese-char-whole-word-masking': {
        'do_lower_case': False,
        'word_tokenizer_type': 'mecab',
        'subword_tokenizer_type': 'character'
    }
}


class BertJapaneseTokenizer(BertTokenizer):
    """BERT tokenizer for Japanese text"""

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(self, vocab_file, do_lower_case=False,
                 do_word_tokenize=True, do_subword_tokenize=True,
                 word_tokenizer_type='basic', subword_tokenizer_type='wordpiece',
                 never_split=None, unk_token='[UNK]', sep_token='[SEP]',
                 pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', **kwargs):
        """Constructs a BertJapaneseTokenizer.

        Args:
            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
            **do_lower_case**: (`optional`) boolean (default False)
                Whether to lower case the input.
                Only has an effect when do_basic_tokenize=True.
            **do_word_tokenize**: (`optional`) boolean (default True)
                Whether to do word tokenization.
            **do_subword_tokenize**: (`optional`) boolean (default True)
                Whether to do subword tokenization.
            **word_tokenizer_type**: (`optional`) string (default "basic")
                Type of word tokenizer.
            **subword_tokenizer_type**: (`optional`) string (default "wordpiece")
                Type of subword tokenizer.
        """
        super(BertTokenizer, self).__init__(
            unk_token=unk_token, sep_token=sep_token, pad_token=pad_token,
            cls_token=cls_token, mask_token=mask_token, **kwargs)
        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens

        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))

        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict(
            [(ids, tok) for tok, ids in self.vocab.items()])

        self.do_word_tokenize = do_word_tokenize
        if do_word_tokenize:
            if word_tokenizer_type == 'basic':
                self.word_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
                                                     never_split=never_split,
                                                     tokenize_chinese_chars=False)
            elif word_tokenizer_type == 'mecab':
                self.word_tokenizer = MecabTokenizer(do_lower_case=do_lower_case,
                                                     never_split=never_split)
            else:
                raise ValueError(
                    "Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type))

        self.do_subword_tokenize = do_subword_tokenize
        if do_subword_tokenize:
            if subword_tokenizer_type == 'wordpiece':
                self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab,
                                                            unk_token=self.unk_token)
            elif subword_tokenizer_type == 'character':
                self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab,
                                                            unk_token=self.unk_token)
            else:
                raise ValueError(
                    "Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type))

    def _tokenize(self, text):
        if self.do_word_tokenize:
            tokens = self.word_tokenizer.tokenize(text, never_split=self.all_special_tokens)
        else:
            tokens = [text]

        if self.do_subword_tokenize:
            split_tokens = [sub_token for token in tokens
                            for sub_token in self.subword_tokenizer.tokenize(token)]
        else:
            split_tokens = tokens

        return split_tokens


class MecabTokenizer(object):
    """Runs basic tokenization with MeCab morphological parser."""

    def __init__(self, do_lower_case=False, never_split=None, normalize_text=True):
        """Constructs a MecabTokenizer.

        Args:
            **do_lower_case**: (`optional`) boolean (default False)
                Whether to lower case the input.
            **never_split**: (`optional`) list of str
                Kept for backward compatibility purposes.
                Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
                List of tokens not to split.
            **normalize_text**: (`optional`) boolean (default True)
                Whether to apply unicode normalization to text before tokenization.
        """
        self.do_lower_case = do_lower_case
        self.never_split = never_split if never_split is not None else []
        self.normalize_text = normalize_text

        import MeCab
        self.mecab = MeCab.Tagger()

    def tokenize(self, text, never_split=None, **kwargs):
        """Tokenizes a piece of text."""
        if self.normalize_text:
            text = unicodedata.normalize('NFKC', text)

        never_split = self.never_split + (never_split if never_split is not None else [])
        tokens = []

        if six.PY2:
            mecab_output = self.mecab.parse(text.encode('utf-8')).decode('utf-8')
        else:
            mecab_output = self.mecab.parse(text)

        cursor = 0
        for line in mecab_output.split('\n'):
            if line == 'EOS':
                break

            token, _ = line.split('\t')
            token_start = text.index(token, cursor)
            token_end = token_start + len(token)
            if self.do_lower_case and token not in never_split:
                token = token.lower()

            tokens.append(token)
            cursor = token_end

        return tokens


class CharacterTokenizer(object):
    """Runs Character tokenization."""

    def __init__(self, vocab, unk_token, normalize_text=True):
        """Constructs a CharacterTokenizer.

        Args:
            **vocab**:
                Vocabulary object.
            **unk_token**: str
                A special symbol for out-of-vocabulary token.
            **normalize_text**: (`optional`) boolean (default True)
                Whether to apply unicode normalization to text before tokenization.
        """
        self.vocab = vocab
        self.unk_token = unk_token
        self.normalize_text = normalize_text

    def tokenize(self, text):
        """Tokenizes a piece of text into characters.

        For example:
            input = "apple"
            output = ["a", "p", "p", "l", "e"]

        Args:
            text: A single token or whitespace separated tokens.
                This should have already been passed through `BasicTokenizer`.

        Returns:
            A list of characters.
        """
        if self.normalize_text:
            text = unicodedata.normalize('NFKC', text)

        output_tokens = []
        for i, char in enumerate(text):
            if char not in self.vocab:
                output_tokens.append(self.unk_token)
                continue

            output_tokens.append(char)

        return output_tokens
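To see the two subword strategies in isolation, without MeCab or any download, here is a toy sketch (not part of the commit) using a hand-built vocabulary:

from transformers.tokenization_bert import WordpieceTokenizer
from transformers.tokenization_bert_japanese import CharacterTokenizer

# Toy vocabulary: a couple of subword pieces plus a few single characters.
vocab = {token: i for i, token in enumerate(
    ["[UNK]", "こん", "##ばんは", "世", "界", "、", "。"])}

wordpiece = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")
characters = CharacterTokenizer(vocab=vocab, unk_token="[UNK]")

print(wordpiece.tokenize("こんばんは"))   # ['こん', '##ばんは']
print(characters.tokenize("世界、こんばんは。"))
# ['世', '界', '、', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '。']
# (single characters absent from the toy vocab map to the unknown token)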
transformers/tokenization_utils.py

@@ -923,7 +923,7 @@ class PreTrainedTokenizer(object):

             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                 or PyTorch torch.Tensor instead of a list of python integers.
             return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True).
-            return_attention_mask: (optional) Set to False to avoir returning attention mask (default True)
+            return_attention_mask: (optional) Set to False to avoid returning attention mask (default True)
             return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False).
             return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False).

@@ -968,24 +968,13 @@ class PreTrainedTokenizer(object):

         if add_special_tokens:
             sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
             token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
-            special_tokens_mask = self.get_special_tokens_mask(ids, pair_ids)
         else:
             sequence = ids + pair_ids if pair else ids
             token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
-            special_tokens_mask = [0] * (len(ids) + (len(pair_ids) if pair else 0))

         if return_special_tokens_mask:
             encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)

-        # Prepare inputs as tensors if asked
-        if return_tensors == 'tf' and is_tf_available():
-            sequence = tf.constant([sequence])
-            token_type_ids = tf.constant([token_type_ids])
-        elif return_tensors == 'pt' and is_torch_available():
-            sequence = torch.tensor([sequence])
-            token_type_ids = torch.tensor([token_type_ids])
-        elif return_tensors is not None:
-            logger.warning("Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(return_tensors))
-
         encoded_inputs["input_ids"] = sequence
         if return_token_type_ids:
             encoded_inputs["token_type_ids"] = token_type_ids

@@ -1022,10 +1011,9 @@ class PreTrainedTokenizer(object):

             if return_special_tokens_mask:
                 encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
             encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference
         elif self.padding_side == 'left':
             if return_attention_mask:
                 encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"])
             if return_token_type_ids:
                 encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs["token_type_ids"]
             if return_special_tokens_mask:

@@ -1037,7 +1025,26 @@ class PreTrainedTokenizer(object):

         elif return_attention_mask:
             encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"])

+        # Prepare inputs as tensors if asked
+        if return_tensors == 'tf' and is_tf_available():
+            encoded_inputs["input_ids"] = tf.constant([encoded_inputs["input_ids"]])
+            encoded_inputs["token_type_ids"] = tf.constant([encoded_inputs["token_type_ids"]])
+
+            if "attention_mask" in encoded_inputs:
+                encoded_inputs["attention_mask"] = tf.constant([encoded_inputs["attention_mask"]])
+
+        elif return_tensors == 'pt' and is_torch_available():
+            encoded_inputs["input_ids"] = torch.tensor([encoded_inputs["input_ids"]])
+            encoded_inputs["token_type_ids"] = torch.tensor([encoded_inputs["token_type_ids"]])
+
+            if "attention_mask" in encoded_inputs:
+                encoded_inputs["attention_mask"] = torch.tensor([encoded_inputs["attention_mask"]])
+        elif return_tensors is not None:
+            logger.warning("Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(return_tensors))
+
         return encoded_inputs

     def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0):
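A small sketch (not from the commit) of the behaviour this last hunk moves: tensor conversion now happens at the end of prepare_for_model, after any padding, so the returned input_ids, token_type_ids and attention_mask are wrapped as tensors while the special-tokens mask stays a plain list. It assumes PyTorch is installed and the bert-base-uncased vocab can be downloaded.

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
encoded = tokenizer.encode_plus("Hello, world!",
                                add_special_tokens=True,
                                return_special_tokens_mask=True,
                                return_tensors="pt")

print(encoded["input_ids"].shape)      # torch.Size([1, sequence_length])
print(encoded["attention_mask"])       # all ones here, also a (1, sequence_length) tensor
print(encoded["special_tokens_mask"])  # plain python list marking [CLS]/[SEP] with 1s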