"vscode:/vscode.git/clone" did not exist on "c81ebd1c39e3e1bc017a3affbba096dc9aedb5a0"
Unverified Commit 5739726f authored by Connor Henderson, committed by GitHub

fix: Text splitting in the BasicTokenizer (#22280)

* fix: Apostrophe splitting in the BasicTokenizer for CLIPTokenizer

* account for apostrophe at start of new word

* remove _run_split_on_punc, use re.findall instead

* remove debugging, make style and quality

* use pattern and punc splitting, repo-consistency will fail

* remove commented out debugging

* adds bool args to BasicTokenizer, remove pattern

* do_split_on_punc default True

* clean stray comments and line breaks

* rebase, repo-consistency

* update to just do punctuation split

* add unicode normalizing back

* remove redundant line
parent 2489e380
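The net effect of the change is easiest to see on a contraction. The sketch below is illustrative only (it is not part of the diff) and assumes the post-change `BasicTokenizer` importable from `transformers.models.bert.tokenization_bert`; the expected outputs are shown as comments.

```python
from transformers.models.bert.tokenization_bert import BasicTokenizer

# Default behaviour is unchanged: punctuation, including apostrophes, is split off.
default_tok = BasicTokenizer(do_lower_case=True)
print(default_tok.tokenize("you've"))  # expected: ['you', "'", 've']

# With the new flag disabled, contractions stay intact so a later sub-word
# tokenizer can see the whole word.
no_punc_tok = BasicTokenizer(do_lower_case=True, do_split_on_punc=False)
print(no_punc_tok.tokenize("you've"))  # expected: ["you've"]

# The added NFC normalization makes composed and decomposed spellings of the
# same character tokenize identically; previously they yielded byte-wise
# different (though visually identical) tokens.
composed = "caf\u00e9"     # 'é' as a single codepoint
decomposed = "cafe\u0301"  # 'e' followed by a combining acute accent
tok = BasicTokenizer(do_lower_case=False, strip_accents=False)
assert tok.tokenize(composed) == tok.tokenize(decomposed)
```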
@@ -385,20 +385,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
WordPieceTokenizer.
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
@@ -417,7 +427,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.).
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
# prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
@@ -445,7 +457,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if never_split is not None and text in never_split:
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
@@ -748,20 +748,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
WordPieceTokenizer.
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
@@ -780,7 +790,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.).
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
# prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
@@ -808,7 +820,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if never_split is not None and text in never_split:
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
@@ -126,20 +126,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
WordPieceTokenizer.
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
@@ -158,7 +168,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.).
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
# prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
@@ -186,7 +198,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if never_split is not None and text in never_split:
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
@@ -316,7 +328,7 @@ class CLIPTokenizer(PreTrainedTokenizer):
self.fix_text = ftfy.fix_text
except ImportError:
logger.info("ftfy or spacy is not installed using custom BasicTokenizer instead of ftfy.")
self.nlp = BasicTokenizer(do_lower_case=True)
self.nlp = BasicTokenizer(strip_accents=False, do_split_on_punc=False)
self.fix_text = None
with open(vocab_file, encoding="utf-8") as vocab_handle:
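This hunk is the consumer of the new flag: when ftfy is unavailable, CLIPTokenizer now falls back to a BasicTokenizer that neither strips accents nor pre-splits punctuation, so contractions reach CLIP's own BPE step intact. A short usage sketch (illustrative only, not part of the diff; the output assumes the post-change defaults):

```python
from transformers.models.bert.tokenization_bert import BasicTokenizer

# The fallback tokenizer as CLIPTokenizer now constructs it.
nlp = BasicTokenizer(strip_accents=False, do_split_on_punc=False)
print(nlp.tokenize("A photo of a dog that's running"))
# expected: ['a', 'photo', 'of', 'a', 'dog', "that's", 'running']
```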
@@ -325,20 +325,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
WordPieceTokenizer.
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
@@ -357,7 +367,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.).
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
# prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
@@ -385,7 +397,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if never_split is not None and text in never_split:
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
@@ -350,20 +350,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
WordPieceTokenizer.
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
@@ -382,7 +392,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.).
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
# prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
@@ -410,7 +422,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if never_split is not None and text in never_split:
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
@@ -342,20 +342,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
WordPieceTokenizer.
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
@@ -374,7 +384,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.).
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
# prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
@@ -402,7 +414,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if never_split is not None and text in never_split:
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
@@ -359,20 +359,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
WordPieceTokenizer.
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
@@ -391,7 +401,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.).
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
# prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
@@ -419,7 +431,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if never_split is not None and text in never_split:
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
@@ -143,20 +143,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
WordPieceTokenizer.
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
@@ -175,7 +185,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.).
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
# prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
@@ -203,7 +215,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if never_split is not None and text in never_split:
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
@@ -324,20 +324,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
WordPieceTokenizer.
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
@@ -356,7 +366,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.).
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
# prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
@@ -384,7 +396,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if never_split is not None and text in never_split:
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
@@ -1362,20 +1362,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
WordPieceTokenizer.
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
@@ -1394,7 +1404,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.).
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
# prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
@@ -1422,7 +1434,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if never_split is not None and text in never_split:
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
@@ -316,20 +316,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
WordPieceTokenizer.
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
@@ -348,7 +358,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.).
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
# prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
@@ -376,7 +388,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if never_split is not None and text in never_split:
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
@@ -314,20 +314,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
WordPieceTokenizer.
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
@@ -346,7 +356,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.).
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
# prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
@@ -374,7 +386,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if never_split is not None and text in never_split:
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
@@ -342,20 +342,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
WordPieceTokenizer.
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
@@ -374,7 +384,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.).
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
# prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
@@ -402,7 +414,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if never_split is not None and text in never_split:
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
@@ -71,20 +71,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
WordPieceTokenizer.
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
@@ -103,7 +113,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.).
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
# prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
@@ -131,7 +143,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if never_split is not None and text in never_split:
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
@@ -72,20 +72,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
WordPieceTokenizer.
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
@@ -104,7 +114,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.).
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
# prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
@@ -132,7 +144,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if never_split is not None and text in never_split:
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
@@ -333,20 +333,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
WordPieceTokenizer.
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
@@ -365,7 +375,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.).
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
# prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
@@ -393,7 +405,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if never_split is not None and text in never_split:
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
@@ -931,20 +931,30 @@ class RoCBertBasicTokenizer(object):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
WordPieceTokenizer.
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
@@ -963,7 +973,9 @@ class RoCBertBasicTokenizer(object):
# words in the English Wikipedia.).
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
# prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
@@ -991,7 +1003,7 @@ class RoCBertBasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if never_split is not None and text in never_split:
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
@@ -107,20 +107,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
WordPieceTokenizer.
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
@@ -139,7 +149,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.).
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
# prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
@@ -167,7 +179,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if never_split is not None and text in never_split:
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
@@ -328,20 +328,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
WordPieceTokenizer.
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
@@ -360,7 +370,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.).
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
# prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
@@ -388,7 +400,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if never_split is not None and text in never_split:
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0
@@ -2055,20 +2055,30 @@ class BasicTokenizer(object):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = set(never_split)
self.tokenize_chinese_chars = tokenize_chinese_chars
self.strip_accents = strip_accents
self.do_split_on_punc = do_split_on_punc
def tokenize(self, text, never_split=None):
"""
Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
WordPieceTokenizer.
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
Args:
never_split (`List[str]`, *optional*)
@@ -2087,7 +2097,9 @@ class BasicTokenizer(object):
# words in the English Wikipedia.).
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
# prevents treating the same character with different unicode codepoints as different characters
unicode_normalized_text = unicodedata.normalize("NFC", text)
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
for token in orig_tokens:
if token not in never_split:
@@ -2115,7 +2127,7 @@ class BasicTokenizer(object):
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if never_split is not None and text in never_split:
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
chars = list(text)
i = 0