chenpangpang / transformers · Commits

Commit ab49fafc, authored Jul 15, 2019 by thomwolf

update tokenization docstrings for #328

parent a9ab1517
Showing 1 changed file with 40 additions and 17 deletions.

pytorch_transformers/tokenization_bert.py (+40 / -17)
@@ -104,16 +104,23 @@ class BertTokenizer(PreTrainedTokenizer):

     def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None,
                  unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]",
-                 mask_token="[MASK]", **kwargs):
+                 mask_token="[MASK]", tokenize_chinese_chars=True, **kwargs):
         """Constructs a BertTokenizer.

         Args:
-            vocab_file: Path to a one-wordpiece-per-line vocabulary file
-            do_lower_case: Whether to lower case the input
-                Only has an effect when do_wordpiece_only=False
-            do_basic_tokenize: Whether to do basic tokenization before wordpiece.
-            never_split: List of tokens which will never be split during tokenization.
-                Only has an effect when do_basic_tokenize=True
+            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file
+            **do_lower_case**: (`optional`) boolean (default True)
+                Whether to lower case the input
+                Only has an effect when do_wordpiece_only=False
+            **do_basic_tokenize**: (`optional`) boolean (default True)
+                Whether to do basic tokenization before wordpiece.
+                Only has an effect when do_wordpiece_only=False
+            **never_split**: (`optional`) list of string
+                List of tokens which will never be split during tokenization.
+                Only has an effect when do_basic_tokenize=True
+            **tokenize_chinese_chars**: (`optional`) boolean (default True)
+                Whether to tokenize Chinese characters.
+                This should likely be deactivated for Japanese:
+                see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
         """
         super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
                                             pad_token=pad_token, cls_token=cls_token,
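A minimal usage sketch of the new flag, assuming network access and that from_pretrained forwards extra keyword arguments to the constructor (as it does in pytorch_transformers); the model name and sample sentence are illustrative only:

from pytorch_transformers import BertTokenizer

# Default: CJK ideographs get whitespace added around them before WordPiece,
# so each character becomes its own token.
tokenizer_default = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# For Japanese (issue #328): leave CJK ideographs untouched so an external
# segmenter (e.g. MeCab) can be applied upstream instead.
tokenizer_no_cjk_split = BertTokenizer.from_pretrained('bert-base-multilingual-cased',
                                                       tokenize_chinese_chars=False)

text = u"吾輩は猫である"  # illustrative sentence only
print(tokenizer_default.tokenize(text))
print(tokenizer_no_cjk_split.tokenize(text))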
@@ -128,7 +135,8 @@ class BertTokenizer(PreTrainedTokenizer):
         self.do_basic_tokenize = do_basic_tokenize
         if do_basic_tokenize:
             self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
-                                                  never_split=never_split)
+                                                  never_split=never_split,
+                                                  tokenize_chinese_chars=tokenize_chinese_chars)
         self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)

     @property
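Since the constructor simply forwards the flag to the BasicTokenizer it builds, the wiring can be checked on the resulting attribute; a rough sketch in which 'vocab.txt' is a hypothetical one-wordpiece-per-line vocabulary file:

from pytorch_transformers.tokenization_bert import BertTokenizer

# 'vocab.txt' is a placeholder path; the file must exist for this to run.
tok = BertTokenizer('vocab.txt', tokenize_chinese_chars=False)

# The flag is handed straight to the internal basic tokenizer.
assert tok.do_basic_tokenize
assert tok.basic_tokenizer.tokenize_chinese_chars is False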
@@ -196,21 +204,36 @@ class BertTokenizer(PreTrainedTokenizer):
 class BasicTokenizer(object):
     """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

-    def __init__(self,
-                 do_lower_case=True,
-                 never_split=None):
-        """Constructs a BasicTokenizer.
+    def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True):
+        """ Constructs a BasicTokenizer.

         Args:
-            do_lower_case: Whether to lower case the input.
+            **do_lower_case**: Whether to lower case the input.
+            **never_split**: (`optional`) list of str
+                Kept for backward compatibility purposes.
+                Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
+                List of tokens not to split.
+            **tokenize_chinese_chars**: (`optional`) boolean (default True)
+                Whether to tokenize Chinese characters.
+                This should likely be deactivated for Japanese:
+                see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
         """
         if never_split is None:
             never_split = []
         self.do_lower_case = do_lower_case
         self.never_split = never_split
+        self.tokenize_chinese_chars = tokenize_chinese_chars

-    def tokenize(self, text, never_split=None, tokenize_chinese_chars=True):
-        """Tokenizes a piece of text."""
+    def tokenize(self, text, never_split=None):
+        """ Basic Tokenization of a piece of text.
+            Split on "white spaces" only, for sub-word tokenization, see WordPieceTokenizer.
+
+        Args:
+            **never_split**: (`optional`) list of str
+                Kept for backward compatibility purposes.
+                Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
+                List of tokens not to split.
+        """
         never_split = self.never_split + (never_split if never_split is not None else [])
         text = self._clean_text(text)
         # This was added on November 1st, 2018 for the multilingual and Chinese
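With the flag now stored on the instance, the standalone BasicTokenizer no longer needs it at call time; a minimal sketch (the example sentence is illustrative only):

from pytorch_transformers.tokenization_bert import BasicTokenizer

with_cjk_split = BasicTokenizer(do_lower_case=False)  # tokenize_chinese_chars defaults to True
without_cjk_split = BasicTokenizer(do_lower_case=False,
                                   tokenize_chinese_chars=False)

text = u"BERTは言語モデルです"  # illustrative only
print(with_cjk_split.tokenize(text))     # kanji become single-character tokens
print(without_cjk_split.tokenize(text))  # text is split on whitespace/punctuation only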
@@ -219,7 +242,7 @@ class BasicTokenizer(object):
         # and generally don't have any Chinese data in them (there are Chinese
         # characters in the vocabulary because Wikipedia does have some Chinese
         # words in the English Wikipedia.).
-        if tokenize_chinese_chars:
+        if self.tokenize_chinese_chars:
             text = self._tokenize_chinese_chars(text)
         orig_tokens = whitespace_tokenize(text)
         split_tokens = []
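For reference, _tokenize_chinese_chars (unchanged by this commit) pads CJK ideographs with spaces so the later whitespace split separates them. A simplified sketch that covers only the main CJK Unified Ideographs block, not the full set of ranges the real helper checks:

def tokenize_chinese_chars_sketch(text):
    # Simplified stand-in for BasicTokenizer._tokenize_chinese_chars:
    # surround every CJK ideograph with spaces so whitespace_tokenize
    # later yields one token per character.
    output = []
    for char in text:
        if 0x4E00 <= ord(char) <= 0x9FFF:  # main CJK Unified Ideographs block only
            output.append(u" " + char + u" ")
        else:
            output.append(char)
    return u"".join(output)

print(tokenize_chinese_chars_sketch(u"BERT是语言模型"))  # -> 'BERT 是  语  言  模  型 '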