"megatron/git@developer.sourcefind.cn:OpenDAS/megatron-lm.git" did not exist on "6728a780dc76ff9684217033d9127f7b1186230b"
Commit e0eaa1ed authored by Sergey Mironov

Update tokenizer: unhardcode alphanumeric char set

parent 7708847e
@@ -45,12 +45,15 @@ _UNESCAPE_REGEX = re.compile(r"\\u|\\\\|\\([0-9]+);")
 
 _UNDEFINED_UNICODE = u"\u3013"
 
-# Set contains all letter and number characters.
-_ALPHANUMERIC_CHAR_SET = set(
-    six.unichr(i) for i in xrange(sys.maxunicode)
-    if (unicodedata.category(six.unichr(i)).startswith("L") or
-        unicodedata.category(six.unichr(i)).startswith("N")))
+def alphanumeric_char_set():
+  return set(
+      six.unichr(i) for i in xrange(sys.maxunicode)
+      if (unicodedata.category(six.unichr(i)).startswith("L") or
+          unicodedata.category(six.unichr(i)).startswith("N")))
+
+# Set contains all letter and number characters.
+_ALPHANUMERIC_CHAR_SET = alphanumeric_char_set()
 
 # min_count is the minimum number of times a subtoken must appear in the data
 # before before it is added to the vocabulary. The value is found using binary
 # search to obtain the target vocabulary size.
@@ -61,11 +64,15 @@ _MAX_MIN_COUNT = 1000  # max value to use when binary searching for min_count
 
 class Subtokenizer(object):
   """Encodes and decodes strings to/from integer IDs."""
 
-  def __init__(self, vocab_file, reserved_tokens=None):
+  def __init__(self, vocab_file, reserved_tokens=None,
+               master_char_set=None):
     """Initializes class, creating a vocab file if data_files is provided."""
     tf.compat.v1.logging.info("Initializing Subtokenizer from file %s." %
                               vocab_file)
 
+    if master_char_set is None:
+      master_char_set = _ALPHANUMERIC_CHAR_SET
+
     if reserved_tokens is None:
       reserved_tokens = RESERVED_TOKENS
@@ -80,11 +87,13 @@ class Subtokenizer(object):
     # Create cache to speed up subtokenization
     self._cache_size = 2 ** 20
     self._cache = [(None, None)] * self._cache_size
+    self._master_char_set = master_char_set
 
   @staticmethod
   def init_from_files(
       vocab_file, files, target_vocab_size, threshold, min_count=None,
-      file_byte_limit=1e6, reserved_tokens=None, correct_strip=True):
+      file_byte_limit=1e6, reserved_tokens=None, correct_strip=True,
+      master_char_set=None):
     """Create subtoken vocabulary based on files, and save vocab to file.
 
     Args:
@@ -105,6 +114,8 @@ class Subtokenizer(object):
     Returns:
       Subtokenizer object
     """
+    if master_char_set is None:
+      master_char_set = _ALPHANUMERIC_CHAR_SET
     if reserved_tokens is None:
       reserved_tokens = RESERVED_TOKENS
 
@@ -112,7 +123,7 @@ class Subtokenizer(object):
       tf.compat.v1.logging.info("Vocab file already exists (%s)" % vocab_file)
     else:
       tf.compat.v1.logging.info("Begin steps to create subtoken vocabulary...")
-      token_counts = _count_tokens(files, file_byte_limit, correct_strip)
+      token_counts = _count_tokens(files, file_byte_limit, correct_strip, master_char_set)
       alphabet = _generate_alphabet_dict(token_counts)
       subtoken_list = _generate_subtokens_with_target_vocab_size(
           token_counts, alphabet, target_vocab_size, threshold, min_count,
@@ -120,12 +131,12 @@ class Subtokenizer(object):
       tf.compat.v1.logging.info("Generated vocabulary with %d subtokens." %
                                 len(subtoken_list))
       _save_vocab_file(vocab_file, subtoken_list)
-    return Subtokenizer(vocab_file)
+    return Subtokenizer(vocab_file, master_char_set=master_char_set)
 
   def encode(self, raw_string, add_eos=False):
     """Encodes a string into a list of int subtoken ids."""
     ret = []
-    tokens = _split_string_to_tokens(native_to_unicode(raw_string))
+    tokens = _split_string_to_tokens(native_to_unicode(raw_string), self._master_char_set)
     for token in tokens:
       ret.extend(self._token_to_subtoken_ids(token))
     if add_eos:
@@ -161,7 +172,8 @@ class Subtokenizer(object):
           "Subtokens argument passed into decode() must be a list of integers.")
 
     return _unicode_to_native(
-        _join_tokens_to_string(self._subtoken_ids_to_tokens(subtokens)))
+        _join_tokens_to_string(self._subtoken_ids_to_tokens(subtokens),
+                               self._master_char_set))
 
   def _subtoken_ids_to_tokens(self, subtokens):
     """Convert list of int subtoken ids to a list of string tokens."""
@@ -218,16 +230,16 @@ def _unicode_to_native(s):
   return s
 
 
-def _split_string_to_tokens(text):
+def _split_string_to_tokens(text, master_char_set):
   """Splits text to a list of string tokens."""
   if not text:
     return []
   ret = []
   token_start = 0
   # Classify each character in the input string
-  is_alnum = [c in _ALPHANUMERIC_CHAR_SET for c in text]
+  is_master = [c in master_char_set for c in text]
   for pos in xrange(1, len(text)):
-    if is_alnum[pos] != is_alnum[pos - 1]:
+    if is_master[pos] != is_master[pos - 1]:
       token = text[token_start:pos]
       if token != u" " or token_start == 0:
         ret.append(token)
@@ -237,12 +249,12 @@ def _split_string_to_tokens(text):
   return ret
 
 
-def _join_tokens_to_string(tokens):
+def _join_tokens_to_string(tokens, master_char_set):
   """Join a list of string tokens into a single string."""
-  token_is_alnum = [t[0] in _ALPHANUMERIC_CHAR_SET for t in tokens]
+  token_is_master = [t[0] in master_char_set for t in tokens]
   ret = []
   for i, token in enumerate(tokens):
-    if i > 0 and token_is_alnum[i - 1] and token_is_alnum[i]:
+    if i > 0 and token_is_master[i - 1] and token_is_master[i]:
       ret.append(u" ")
     ret.append(token)
   return "".join(ret)
@@ -324,7 +336,8 @@ def _unescape_token(token):
   return _UNESCAPE_REGEX.sub(match, token)
 
 
-def _count_tokens(files, file_byte_limit=1e6, correct_strip=True):
+def _count_tokens(files, file_byte_limit=1e6, correct_strip=True,
+                  master_char_set=None):
   """Return token counts of words in the files.
 
   Samples file_byte_limit bytes from each file, and counts the words that appear
@@ -342,6 +355,9 @@ def _count_tokens(files, file_byte_limit=1e6, correct_strip=True):
     Dictionary mapping tokens to the number of times they appear in the sampled
     lines from the files.
   """
+  if master_char_set is None:
+    master_char_set = _ALPHANUMERIC_CHAR_SET
+
   token_counts = collections.defaultdict(int)
 
   for filepath in files:
@@ -362,7 +378,8 @@ def _count_tokens(files, file_byte_limit=1e6, correct_strip=True):
           counter = 0
 
           # Add words to token counts
-          for token in _split_string_to_tokens(native_to_unicode(line)):
+          for token in _split_string_to_tokens(native_to_unicode(line),
+                                               master_char_set):
            token_counts[token] += 1
   return token_counts
...
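The commit threads a master_char_set argument through Subtokenizer, Subtokenizer.init_from_files, _split_string_to_tokens, _join_tokens_to_string, and _count_tokens, so the set of characters that marks token boundaries is configurable instead of being fixed to the module-level _ALPHANUMERIC_CHAR_SET. Below is a minimal usage sketch; the "tokenizer" import name, the file paths, and the vocabulary parameters are assumptions for illustration and are not part of this commit.

# A minimal usage sketch. The module name "tokenizer", the file paths and the
# vocabulary parameters below are assumptions for illustration only.
import sys
import unicodedata

import six
from six.moves import xrange  # pylint: disable=redefined-builtin

import tokenizer  # hypothetical import name for the patched module


def letters_only_char_set():
  # Example of a custom master character set: Unicode letters only (category
  # "L"), mirroring the structure of the module's alphanumeric_char_set().
  return set(
      six.unichr(i) for i in xrange(sys.maxunicode)
      if unicodedata.category(six.unichr(i)).startswith("L"))


# Build (or load) a subtoken vocabulary that splits tokens on the custom set.
subtokenizer = tokenizer.Subtokenizer.init_from_files(
    vocab_file="vocab.letters.subtokens",   # hypothetical output path
    files=["train.en", "train.de"],         # hypothetical training files
    target_vocab_size=32768,
    threshold=327,
    master_char_set=letters_only_char_set())

ids = subtokenizer.encode("Hello world 2024", add_eos=True)
print(subtokenizer.decode(ids))

Callers that omit master_char_set (or pass None) keep the old behaviour, because every entry point falls back to _ALPHANUMERIC_CHAR_SET, which is itself now built by the new alphanumeric_char_set() helper.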