Unverified Commit 5739726f authored by Connor Henderson, committed by GitHub

fix: Text splitting in the BasicTokenizer (#22280)

* fix: Apostrophe splitting in the BasicTokenizer for CLIPTokenizer

* account for apostrophe at start of new word

* remove _run_split_on_punc, use re.findall instead

* remove debugging, make style and quality

* use pattern and punc splitting, repo-consistency will fail

* remove commented out debugging

* adds bool args to BasicTokenizer, remove pattern

* do_split_on_punc default True

* clean stray comments and line breaks

* rebase, repo-consistency

* update to just do punctuation split

* add unicode normalizing back

* remove redundant line
parent 2489e380
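
In short: `BasicTokenizer` gains a `do_split_on_punc` argument (defaulting to `True`, so existing behaviour is unchanged) and now NFC-normalizes its input before whitespace splitting. A minimal sketch of the intended effect, using the BERT `BasicTokenizer` that the diffs below modify; the commented results are what the new code should produce, not captured output:

```python
from transformers.models.bert.tokenization_bert import BasicTokenizer

# Default behaviour (unchanged): punctuation is split out, so the apostrophe
# in a contraction becomes its own token.
print(BasicTokenizer().tokenize("don't stop"))
# expected: ['don', "'", 't', 'stop']

# With the new flag, the word is kept whole so a later tokenizer
# (e.g. CLIP's BPE) sees the full contraction.
print(BasicTokenizer(do_split_on_punc=False).tokenize("don't stop"))
# expected: ["don't", 'stop']
```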
@@ -385,20 +385,30 @@ class BasicTokenizer(object):
         strip_accents (`bool`, *optional*):
             Whether or not to strip all accents. If this option is not specified, then it will be determined by the
             value for `lowercase` (as in the original BERT).
+        do_split_on_punc (`bool`, *optional*, defaults to `True`):
+            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
+            the full context of the words, such as contractions.
     """
 
-    def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
+    def __init__(
+        self,
+        do_lower_case=True,
+        never_split=None,
+        tokenize_chinese_chars=True,
+        strip_accents=None,
+        do_split_on_punc=True,
+    ):
         if never_split is None:
             never_split = []
         self.do_lower_case = do_lower_case
         self.never_split = set(never_split)
         self.tokenize_chinese_chars = tokenize_chinese_chars
         self.strip_accents = strip_accents
+        self.do_split_on_punc = do_split_on_punc
 
     def tokenize(self, text, never_split=None):
         """
-        Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
-        WordPieceTokenizer.
+        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
 
         Args:
             never_split (`List[str]`, *optional*)
@@ -417,7 +427,9 @@ class BasicTokenizer(object):
         # words in the English Wikipedia.).
         if self.tokenize_chinese_chars:
             text = self._tokenize_chinese_chars(text)
-        orig_tokens = whitespace_tokenize(text)
+        # prevents treating the same character with different unicode codepoints as different characters
+        unicode_normalized_text = unicodedata.normalize("NFC", text)
+        orig_tokens = whitespace_tokenize(unicode_normalized_text)
         split_tokens = []
         for token in orig_tokens:
             if token not in never_split:
@@ -445,7 +457,7 @@ class BasicTokenizer(object):
 
     def _run_split_on_punc(self, text, never_split=None):
         """Splits punctuation on a piece of text."""
-        if never_split is not None and text in never_split:
+        if not self.do_split_on_punc or (never_split is not None and text in never_split):
             return [text]
         chars = list(text)
         i = 0
......
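
The NFC normalization added in the `tokenize()` hunk above matters because the same rendered character can arrive as different codepoint sequences. A standard-library illustration, independent of transformers:

```python
import unicodedata

composed = "caf\u00e9"      # 'café' using the single precomposed character U+00E9
decomposed = "cafe\u0301"   # 'café' as 'e' followed by the combining acute accent U+0301

print(composed == decomposed)  # False: different codepoints, same rendered text

# After NFC normalization both forms collapse to the composed representation,
# so the tokenizer treats them as the same token.
print(unicodedata.normalize("NFC", composed) == unicodedata.normalize("NFC", decomposed))  # True
```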
@@ -748,20 +748,30 @@ class BasicTokenizer(object):
@@ -780,7 +790,9 @@ class BasicTokenizer(object):
@@ -808,7 +820,7 @@ class BasicTokenizer(object):
Same change as the first diff above, applied to another repo-consistency copy of `BasicTokenizer`: the new `do_split_on_punc` argument, NFC normalization in `tokenize()`, and the `do_split_on_punc` guard in `_run_split_on_punc()`.
......
@@ -126,20 +126,30 @@ class BasicTokenizer(object):
@@ -158,7 +168,9 @@ class BasicTokenizer(object):
@@ -186,7 +198,7 @@ class BasicTokenizer(object):
Same change as above, applied to the `BasicTokenizer` copy in the CLIP tokenizer module. In addition, the `CLIPTokenizer` fallback now opts out of accent stripping and punctuation splitting:
@@ -316,7 +328,7 @@ class CLIPTokenizer(PreTrainedTokenizer):
             self.fix_text = ftfy.fix_text
         except ImportError:
             logger.info("ftfy or spacy is not installed using custom BasicTokenizer instead of ftfy.")
-            self.nlp = BasicTokenizer(do_lower_case=True)
+            self.nlp = BasicTokenizer(strip_accents=False, do_split_on_punc=False)
             self.fix_text = None
 
         with open(vocab_file, encoding="utf-8") as vocab_handle:
......
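
The hunk above is the user-visible part of the fix: when ftfy (or spacy) is not installed, the slow `CLIPTokenizer` now configures its fallback `BasicTokenizer` with `strip_accents=False, do_split_on_punc=False`, so pre-tokenization only whitespace-splits and NFC-normalizes, and contractions reach CLIP's BPE intact. Below is a rough standalone sketch of the guarded punctuation split, not the library code; the real `_is_punctuation` also treats ASCII symbols and all Unicode `P*` categories as punctuation, unlike the simple check here:

```python
def run_split_on_punc(text, do_split_on_punc=True, never_split=None):
    # Early return mirrors the new guard: skip splitting entirely when asked to,
    # or when the token is protected by never_split.
    if not do_split_on_punc or (never_split is not None and text in never_split):
        return [text]
    output = []
    start_new_word = True
    for ch in text:
        if not ch.isalnum() and not ch.isspace():  # rough stand-in for _is_punctuation
            output.append([ch])  # punctuation becomes its own token
            start_new_word = True
        else:
            if start_new_word:
                output.append([])
            start_new_word = False
            output[-1].append(ch)
    return ["".join(piece) for piece in output]

print(run_split_on_punc("don't"))                          # ['don', "'", 't']
print(run_split_on_punc("don't", do_split_on_punc=False))  # ["don't"]
```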
@@ -325,20 +325,30 @@ class BasicTokenizer(object):
@@ -357,7 +367,9 @@ class BasicTokenizer(object):
@@ -385,7 +397,7 @@ class BasicTokenizer(object):
Same change as the first diff above, applied to another repo-consistency copy of `BasicTokenizer`.
......
@@ -350,20 +350,30 @@ class BasicTokenizer(object):
@@ -382,7 +392,9 @@ class BasicTokenizer(object):
@@ -410,7 +422,7 @@ class BasicTokenizer(object):
Same change as the first diff above, applied to another repo-consistency copy of `BasicTokenizer`.
......
@@ -342,20 +342,30 @@ class BasicTokenizer(object):
@@ -374,7 +384,9 @@ class BasicTokenizer(object):
@@ -402,7 +414,7 @@ class BasicTokenizer(object):
Same change as the first diff above, applied to another repo-consistency copy of `BasicTokenizer`.
......
@@ -359,20 +359,30 @@ class BasicTokenizer(object):
@@ -391,7 +401,9 @@ class BasicTokenizer(object):
@@ -419,7 +431,7 @@ class BasicTokenizer(object):
Same change as the first diff above, applied to another repo-consistency copy of `BasicTokenizer`.
......
@@ -143,20 +143,30 @@ class BasicTokenizer(object):
@@ -175,7 +185,9 @@ class BasicTokenizer(object):
@@ -203,7 +215,7 @@ class BasicTokenizer(object):
Same change as the first diff above, applied to another repo-consistency copy of `BasicTokenizer`.
......
@@ -324,20 +324,30 @@ class BasicTokenizer(object):
@@ -356,7 +366,9 @@ class BasicTokenizer(object):
@@ -384,7 +396,7 @@ class BasicTokenizer(object):
Same change as the first diff above, applied to another repo-consistency copy of `BasicTokenizer`.
......
@@ -1362,20 +1362,30 @@ class BasicTokenizer(object):
@@ -1394,7 +1404,9 @@ class BasicTokenizer(object):
@@ -1422,7 +1434,7 @@ class BasicTokenizer(object):
Same change as the first diff above, applied to another repo-consistency copy of `BasicTokenizer`.
......
@@ -316,20 +316,30 @@ class BasicTokenizer(object):
@@ -348,7 +358,9 @@ class BasicTokenizer(object):
@@ -376,7 +388,7 @@ class BasicTokenizer(object):
Same change as the first diff above, applied to another repo-consistency copy of `BasicTokenizer`.
......
@@ -314,20 +314,30 @@ class BasicTokenizer(object):
@@ -346,7 +356,9 @@ class BasicTokenizer(object):
@@ -374,7 +386,7 @@ class BasicTokenizer(object):
Same change as the first diff above, applied to another repo-consistency copy of `BasicTokenizer`.
......
@@ -342,20 +342,30 @@ class BasicTokenizer(object):
@@ -374,7 +384,9 @@ class BasicTokenizer(object):
@@ -402,7 +414,7 @@ class BasicTokenizer(object):
Same change as the first diff above, applied to another repo-consistency copy of `BasicTokenizer`.
......
@@ -71,20 +71,30 @@ class BasicTokenizer(object):
@@ -103,7 +113,9 @@ class BasicTokenizer(object):
@@ -131,7 +143,7 @@ class BasicTokenizer(object):
Same change as the first diff above, applied to another repo-consistency copy of `BasicTokenizer`.
......
@@ -72,20 +72,30 @@ class BasicTokenizer(object):
@@ -104,7 +114,9 @@ class BasicTokenizer(object):
@@ -132,7 +144,7 @@ class BasicTokenizer(object):
Same change as the first diff above, applied to another repo-consistency copy of `BasicTokenizer`.
......
@@ -333,20 +333,30 @@ class BasicTokenizer(object):
@@ -365,7 +375,9 @@ class BasicTokenizer(object):
@@ -393,7 +405,7 @@ class BasicTokenizer(object):
Same change as the first diff above, applied to another repo-consistency copy of `BasicTokenizer`.
......
@@ -931,20 +931,30 @@ class RoCBertBasicTokenizer(object):
@@ -963,7 +973,9 @@ class RoCBertBasicTokenizer(object):
@@ -991,7 +1003,7 @@ class RoCBertBasicTokenizer(object):
Same change as the first diff above, applied to `RoCBertBasicTokenizer`, the repo-consistency copy used by RoCBert.
......
@@ -107,20 +107,30 @@ class BasicTokenizer(object):
@@ -139,7 +149,9 @@ class BasicTokenizer(object):
@@ -167,7 +179,7 @@ class BasicTokenizer(object):
Same change as the first diff above, applied to another repo-consistency copy of `BasicTokenizer`.
......
@@ -328,20 +328,30 @@ class BasicTokenizer(object):
@@ -360,7 +370,9 @@ class BasicTokenizer(object):
@@ -388,7 +400,7 @@ class BasicTokenizer(object):
Same change as the first diff above, applied to another repo-consistency copy of `BasicTokenizer`.
......
@@ -2055,20 +2055,30 @@ class BasicTokenizer(object):
@@ -2087,7 +2097,9 @@ class BasicTokenizer(object):
@@ -2115,7 +2127,7 @@ class BasicTokenizer(object):
Same change as the first diff above, applied to another repo-consistency copy of `BasicTokenizer`.
......