Unverified Commit 00c4e395 authored by Lysandre Debut, committed by GitHub

Merge branch 'master' into squad-refactor

parents e9217da5 0cb16386
@@ -72,7 +72,7 @@ def bytes_to_unicode():
    """
    Returns list of utf-8 byte and a mapping to unicode strings.
    We specifically avoids mapping to whitespace/control characters the bpe code barfs on.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
@@ -107,10 +107,10 @@ class GPT2Tokenizer(PreTrainedTokenizer):
    """
    GPT-2 BPE tokenizer. Peculiarities:
        - Byte-level Byte-Pair-Encoding
-       - Requires a space to start the input string => the encoding methods should be called with the
+       - Requires a space to start the input string => the encoding and tokenize methods should be called with the
          ``add_prefix_space`` flag set to ``True``.
-         Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
-         the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
+         Otherwise, this tokenizer's ``encode``, ``decode``, and ``tokenize`` methods will not conserve
+         the spaces at the beginning of a string: `tokenizer.decode(tokenizer.encode(" Hello")) = "Hello"`
    """
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
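As a side note on the revised docstring: the flag is accepted by the public tokenize/encode calls. A minimal sketch of the difference it makes, assuming the pretrained 'gpt2' vocabulary is available:

    from transformers import GPT2Tokenizer

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    # Byte-level BPE encodes a word differently at the very start of a string
    # than after a space, so the docstring recommends add_prefix_space=True.
    tokens_plain = tokenizer.tokenize("Hello world")
    tokens_prefixed = tokenizer.tokenize("Hello world", add_prefix_space=True)
    print(tokens_plain[0], tokens_prefixed[0])  # the first token differs between the two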
@@ -122,13 +122,15 @@ class GPT2Tokenizer(PreTrainedTokenizer):
        self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
        self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
-       self.encoder = json.load(open(vocab_file, encoding="utf-8"))
+       with open(vocab_file, encoding="utf-8") as vocab_handle:
+           self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-       bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
-       bpe_merges = [tuple(merge.split()) for merge in bpe_data]
+       with open(merges_file, encoding='utf-8') as merges_handle:
+           bpe_merges = merges_handle.read().split('\n')[1:-1]
+       bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}
@@ -184,7 +186,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
        """ Tokenize a string.
            Args:
                - add_prefix_space (boolean, default False):
-                   Begin the sentence with at least one space toto get invariance to word order in GPT-2 (and RoBERTa) tokenizers.
+                   Begin the sentence with at least one space to get invariance to word order in GPT-2 (and RoBERTa) tokenizers.
        """
        if add_prefix_space:
            text = ' ' + text
@@ -234,4 +236,4 @@ class GPT2Tokenizer(PreTrainedTokenizer):
            writer.write(' '.join(bpe_tokens) + u'\n')
            index += 1
        return vocab_file, merge_file
\ No newline at end of file
@@ -101,9 +101,11 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
            self.nlp = BasicTokenizer(do_lower_case=True)
            self.fix_text = None
-       self.encoder = json.load(open(vocab_file, encoding="utf-8"))
+       with open(vocab_file, encoding="utf-8") as vocab_handle:
+           self.encoder = json.load(vocab_handle)
        self.decoder = {v:k for k,v in self.encoder.items()}
-       merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
+       with open(merges_file, encoding='utf-8') as merges_handle:
+           merges = merges_handle.read().split('\n')[1:-1]
        merges = [tuple(merge.split()) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}
...
@@ -22,6 +22,7 @@ import json
import six
import copy
import itertools
+import re
from io import open
from .file_utils import cached_path, is_tf_available, is_torch_available
@@ -263,6 +264,9 @@ class PreTrainedTokenizer(object):
            force_download: (`optional`) boolean, default False:
                Force to (re-)download the vocabulary files and override the cached versions if they exists.
+           resume_download: (`optional`) boolean, default False:
+               Do not delete incompletely received file. Attempt to resume the download if such a file exists.
            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.
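The new flag is simply forwarded from `from_pretrained` down to `cached_path` (see the hunks below), so resuming an interrupted vocabulary download is a one-liner; a sketch, with 'bert-base-uncased' used only as an example checkpoint:

    from transformers import BertTokenizer

    # Keeps a partially downloaded vocabulary file and resumes the transfer
    # instead of starting over.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', resume_download=True)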
@@ -298,6 +302,7 @@ class PreTrainedTokenizer(object):
    def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
        cache_dir = kwargs.pop('cache_dir', None)
        force_download = kwargs.pop('force_download', False)
+       resume_download = kwargs.pop('resume_download', False)
        proxies = kwargs.pop('proxies', None)
        s3_models = list(cls.max_model_input_sizes.keys())
@@ -354,7 +359,7 @@ class PreTrainedTokenizer(object):
                "We assumed '{}' was a path or url to a directory containing vocabulary files "
                "named {} but couldn't find such vocabulary files at this path or url.".format(
                    pretrained_model_name_or_path, ', '.join(s3_models),
                    pretrained_model_name_or_path,
                    list(cls.vocab_files_names.values())))
        # Get files from url, cache, or disk depending on the case
@@ -364,7 +369,7 @@ class PreTrainedTokenizer(object):
                if file_path is None:
                    resolved_vocab_files[file_id] = None
                else:
-                   resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
+                   resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies, resume_download=resume_download)
        except EnvironmentError:
            if pretrained_model_name_or_path in s3_models:
                msg = "Couldn't reach server at '{}' to download vocabulary files."
@@ -389,7 +394,8 @@ class PreTrainedTokenizer(object):
        # Did we saved some inputs and kwargs to reload ?
        tokenizer_config_file = resolved_vocab_files.pop('tokenizer_config_file', None)
        if tokenizer_config_file is not None:
-           init_kwargs = json.load(open(tokenizer_config_file, encoding="utf-8"))
+           with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
+               init_kwargs = json.load(tokenizer_config_handle)
            saved_init_inputs = init_kwargs.pop('init_inputs', ())
            if not init_inputs:
                init_inputs = saved_init_inputs
@@ -414,7 +420,8 @@ class PreTrainedTokenizer(object):
            if args_name not in init_kwargs:
                init_kwargs[args_name] = file_path
        if special_tokens_map_file is not None:
-           special_tokens_map = json.load(open(special_tokens_map_file, encoding="utf-8"))
+           with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
+               special_tokens_map = json.load(special_tokens_map_handle)
            for key, value in special_tokens_map.items():
                if key not in init_kwargs:
                    init_kwargs[key] = value
@@ -428,7 +435,8 @@ class PreTrainedTokenizer(object):
        # Add supplementary tokens.
        if added_tokens_file is not None:
-           added_tok_encoder = json.load(open(added_tokens_file, encoding="utf-8"))
+           with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
+               added_tok_encoder = json.load(added_tokens_handle)
            added_tok_decoder = {v:k for k, v in added_tok_encoder.items()}
            tokenizer.added_tokens_encoder.update(added_tok_encoder)
            tokenizer.added_tokens_decoder.update(added_tok_decoder)
@@ -524,6 +532,8 @@ class PreTrainedTokenizer(object):
        to_add_tokens = []
        for token in new_tokens:
            assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))
+           if self.init_kwargs.get('do_lower_case', False) and token not in self.all_special_tokens:
+               token = token.lower()
            if token != self.unk_token and \
                    self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) and \
                    token not in to_add_tokens:
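A sketch of what the new branch changes for an uncased model (assuming do_lower_case=True ends up in init_kwargs, as it does for uncased pretrained checkpoints):

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Non-special added tokens are lowercased so they match the lowercased text
    # produced at tokenization time; special tokens keep their casing.
    tokenizer.add_tokens(['NewDomainToken'])
    print('newdomaintoken' in tokenizer.added_tokens_encoder)  # expected: True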
@@ -621,6 +631,19 @@ class PreTrainedTokenizer(object):
                return_tokens_mapped_to_origin: (optional) Set to True to return the index of each token in the initial whitespace tokenization. (default False).
                **kwargs: passed to the child `self.tokenize()` method
        """
+       def lowercase_text(t):
+           # convert non-special tokens to lowercase
+           escaped_special_toks = [re.escape(s_tok) for s_tok in self.all_special_tokens]
+           pattern = r'(^' + r'|'.join(escaped_special_toks) + r')|' + \
+               r'(.+?)'
+           return re.sub(
+               pattern,
+               lambda m: m.groups()[0] or m.groups()[1].lower(),
+               t)
+
+       if self.init_kwargs.get('do_lower_case', False):
+           text = lowercase_text(text)
+
        def split_on_token(tok, text):
            result = []
            split_text = text.split(tok)
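The regex above lowercases everything except the special tokens themselves. The same idea as a standalone sketch, with a hypothetical special-token list and the start-of-string anchor dropped for simplicity:

    import re

    special_tokens = ['[CLS]', '[SEP]', '[MASK]']
    pattern = r'(' + r'|'.join(re.escape(tok) for tok in special_tokens) + r')|(.+?)'

    # Group 1 captures a special token and is kept verbatim; otherwise group 2
    # captures one character at a time and is lowercased.
    lowered = re.sub(pattern,
                     lambda m: m.groups()[0] or m.groups()[1].lower(),
                     "Hello [SEP] World")
    print(lowered)  # hello [SEP] world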
@@ -823,7 +846,6 @@ class PreTrainedTokenizer(object):
            ``input_ids``: list of token ids to be fed to a model
            ``token_type_ids``: list of token type ids to be fed to a model
            ``attention_mask``: list of indices specifying which tokens should be attended to by the model
            ``overflowing_tokens``: list of overflowing tokens if a max length is specified.
            ``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified
            ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
@@ -904,15 +926,18 @@ class PreTrainedTokenizer(object):
                {
                    input_ids: list[int],
-                   overflowing_tokens: list[int] if a ``max_length`` is specified, else None
-                   special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True``
+                   token_type_ids: list[int] if return_token_type_ids is True (default)
+                   overflowing_tokens: list[int] if a ``max_length`` is specified and return_overflowing_tokens is True
+                   num_truncated_tokens: int if a ``max_length`` is specified and return_overflowing_tokens is True
+                   special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True`` and return_special_tokens_mask is True
                }
            With the fields:
-               ``input_ids``: list of tokens to be fed to a model
+               ``input_ids``: list of token ids to be fed to a model
+               ``token_type_ids``: list of token type ids to be fed to a model
                ``overflowing_tokens``: list of overflowing tokens if a max length is specified.
+               ``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified
                ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
                    tokens and 1 specifying sequence tokens.
        """
@@ -921,23 +946,31 @@ class PreTrainedTokenizer(object):
        len_pair_ids = len(pair_ids) if pair else 0
        encoded_inputs = {}
+       # Handle max sequence length
        total_len = len_ids + len_pair_ids + (self.num_added_tokens(pair=pair) if add_special_tokens else 0)
        if max_length and total_len > max_length:
            ids, pair_ids, overflowing_tokens = self.truncate_sequences(ids, pair_ids=pair_ids,
                                                                        num_tokens_to_remove=total_len-max_length,
                                                                        truncation_strategy=truncation_strategy,
                                                                        stride=stride)
-           encoded_inputs["overflowing_tokens"] = overflowing_tokens
-           encoded_inputs["num_truncated_tokens"] = total_len - max_length
+           if return_overflowing_tokens:
+               encoded_inputs["overflowing_tokens"] = overflowing_tokens
+               encoded_inputs["num_truncated_tokens"] = total_len - max_length
+       # Handle special_tokens
        if add_special_tokens:
            sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
-           encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
+           special_tokens_mask = self.get_special_tokens_mask(ids, pair_ids)
        else:
            sequence = ids + pair_ids if pair else ids
            token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
+           special_tokens_mask = [0] * (len(ids) + (len(pair_ids) if pair else 0))
+       if return_special_tokens_mask:
+           encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
+       # Prepare inputs as tensors if asked
        if return_tensors == 'tf' and is_tf_available():
            sequence = tf.constant([sequence])
            token_type_ids = tf.constant([token_type_ids])
@@ -948,12 +981,15 @@ class PreTrainedTokenizer(object):
            logger.warning("Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(return_tensors))
        encoded_inputs["input_ids"] = sequence
-       encoded_inputs["token_type_ids"] = token_type_ids
+       if return_token_type_ids:
+           encoded_inputs["token_type_ids"] = token_type_ids
        if max_length and len(encoded_inputs["input_ids"]) > max_length:
            encoded_inputs["input_ids"] = encoded_inputs["input_ids"][:max_length]
-           encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length]
-           encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length]
+           if return_token_type_ids:
+               encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length]
+           if return_special_tokens_mask:
+               encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length]
        if max_length is None and len(encoded_inputs["input_ids"]) > self.max_len:
            logger.warning("Token indices sequence length is longer than the specified maximum sequence length "
@@ -995,7 +1031,7 @@ class PreTrainedTokenizer(object):
        elif return_attention_mask:
            encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"])
        return encoded_inputs

    def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0):
@@ -1039,7 +1075,6 @@ class PreTrainedTokenizer(object):
        return (ids, pair_ids, overflowing_tokens)

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
-       logger.warning("This tokenizer does not make use of special tokens.")
        if token_ids_1 is None:
            return len(token_ids_0) * [0]
        return [0] * len(token_ids_0) + [1] * len(token_ids_1)
@@ -1052,7 +1087,6 @@ class PreTrainedTokenizer(object):
            single sequence: <s> X </s>
            pair of sequences: <s> A </s></s> B </s>
        """
-       logger.warning("This tokenizer does not make use of special tokens. Input is returned with no modification.")
        if token_ids_1 is None:
            return token_ids_0
        return token_ids_0 + token_ids_1
...
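With the warnings removed, the base-class fallbacks shown above simply pass the inputs through; restated as a standalone sketch (not the actual methods, which live on PreTrainedTokenizer and are overridden by subclasses):

    # Base class: no special tokens are added.
    def build_inputs_with_special_tokens(token_ids_0, token_ids_1=None):
        if token_ids_1 is None:
            return token_ids_0
        return token_ids_0 + token_ids_1

    # Segment 0 for the first sequence, segment 1 for the optional second one.
    def create_token_type_ids_from_sequences(token_ids_0, token_ids_1=None):
        if token_ids_1 is None:
            return len(token_ids_0) * [0]
        return [0] * len(token_ids_0) + [1] * len(token_ids_1)

    print(create_token_type_ids_from_sequences([5, 6], [7, 8, 9]))  # [0, 0, 1, 1, 1]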
@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-"""Tokenization classes for OpenAI GPT."""
+"""Tokenization classes for XLM."""
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
@@ -524,7 +524,7 @@ class XLMTokenizer(PreTrainedTokenizer):
        - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
          (ex: "__classify__") to a vocabulary
        - `lang2id` attribute maps the languages supported by the model with their ids if provided (automatically set for pretrained vocabularies)
        - `id2lang` attributes does reverse mapping if provided (automatically set for pretrained vocabularies)
@@ -564,9 +564,11 @@ class XLMTokenizer(PreTrainedTokenizer):
        self.ja_word_tokenizer = None
        self.zh_word_tokenizer = None
-       self.encoder = json.load(open(vocab_file, encoding="utf-8"))
+       with open(vocab_file, encoding="utf-8") as vocab_handle:
+           self.encoder = json.load(vocab_handle)
        self.decoder = {v:k for k,v in self.encoder.items()}
-       merges = open(merges_file, encoding='utf-8').read().split('\n')[:-1]
+       with open(merges_file, encoding='utf-8') as merges_handle:
+           merges = merges_handle.read().split('\n')[:-1]
        merges = [tuple(merge.split()[:2]) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}
@@ -758,9 +760,9 @@ class XLMTokenizer(PreTrainedTokenizer):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
        by concatenating and adding special tokens.
-       A RoBERTa sequence has the following format:
+       A XLM sequence has the following format:
            single sequence: <s> X </s>
-           pair of sequences: <s> A </s></s> B </s>
+           pair of sequences: <s> A </s> B </s>
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
...
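For the corrected XLM docstring, the single-sequence case visible above can be checked directly; a sketch assuming the pretrained 'xlm-mlm-en-2048' vocabulary:

    from transformers import XLMTokenizer

    tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
    ids = tokenizer.build_inputs_with_special_tokens([10, 11])
    # Per the code above: <s> X </s>  ->  [cls_token_id] + X + [sep_token_id]
    print(ids == [tokenizer.cls_token_id, 10, 11, tokenizer.sep_token_id])  # expected: True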
@@ -143,7 +143,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
        pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
        new_pieces = []
        for piece in pieces:
-           if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit():
+           if len(piece) > 1 and piece[-1] == str(',') and piece[-2].isdigit():
                cur_pieces = self.sp_model.EncodeAsPieces(
                    piece[:-1].replace(SPIECE_UNDERLINE, ''))
                if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
@@ -187,9 +187,9 @@ class XLNetTokenizer(PreTrainedTokenizer):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
        by concatenating and adding special tokens.
-       A RoBERTa sequence has the following format:
-           single sequence: <s> X </s>
-           pair of sequences: <s> A </s></s> B </s>
+       An XLNet sequence has the following format:
+           single sequence: X <sep> <cls>
+           pair of sequences: A <sep> B <sep> <cls>
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
@@ -226,10 +226,10 @@ class XLNetTokenizer(PreTrainedTokenizer):
    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
-       A BERT sequence pair mask has the following format:
+       An XLNet sequence pair mask has the following format:
        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
        | first sequence | second sequence | CLS segment ID
        if token_ids_1 is None, only returns the first portion of the mask (0's).
        """
        sep = [self.sep_token_id]
...
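The corrected XLNet mask description can likewise be checked against the implementation; a sketch assuming the pretrained 'xlnet-base-cased' vocabulary:

    from transformers import XLNetTokenizer

    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
    segment_ids = tokenizer.create_token_type_ids_from_sequences([10, 11], [20, 21, 22])
    # Docstring layout: first sequence -> 0, second sequence -> 1, trailing CLS -> 2
    print(segment_ids)  # expected: [0, 0, 0, 1, 1, 1, 1, 2]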
''' Script for downloading all GLUE data.
Original source: https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e
Note: for legal reasons, we are unable to host MRPC.
You can either use the version hosted by the SentEval team, which is already tokenized,
or you can download the original data from (https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi) and extract the data from it manually.
For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example).
You should then rename and place specific files in a folder (see below for an example).
mkdir MRPC
cabextract MSRParaphraseCorpus.msi -d MRPC
cat MRPC/_2DEC3DBE877E4DB192D17C0256E90F1D | tr -d $'\r' > MRPC/msr_paraphrase_train.txt
cat MRPC/_D7B391F9EAFF4B1B8BCE8F21B20B1B61 | tr -d $'\r' > MRPC/msr_paraphrase_test.txt
rm MRPC/_*
rm MSRParaphraseCorpus.msi
1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now.
2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray!
'''
import os
import sys
import shutil
import argparse
import tempfile
import urllib.request
import zipfile
TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "SNLI", "QNLI", "RTE", "WNLI", "diagnostic"]
TASK2PATH = {"CoLA":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4',
"SST":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8',
"MRPC":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc',
"QQP":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP.zip?alt=media&token=700c6acf-160d-4d89-81d1-de4191d02cb5',
"STS":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5',
"MNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce',
"SNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df',
"QNLI": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601',
"RTE":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb',
"WNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf',
"diagnostic":'https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D'}
MRPC_TRAIN = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt'
MRPC_TEST = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt'
def download_and_extract(task, data_dir):
print("Downloading and extracting %s..." % task)
data_file = "%s.zip" % task
urllib.request.urlretrieve(TASK2PATH[task], data_file)
with zipfile.ZipFile(data_file) as zip_ref:
zip_ref.extractall(data_dir)
os.remove(data_file)
print("\tCompleted!")
def format_mrpc(data_dir, path_to_data):
print("Processing MRPC...")
mrpc_dir = os.path.join(data_dir, "MRPC")
if not os.path.isdir(mrpc_dir):
os.mkdir(mrpc_dir)
if path_to_data:
mrpc_train_file = os.path.join(path_to_data, "msr_paraphrase_train.txt")
mrpc_test_file = os.path.join(path_to_data, "msr_paraphrase_test.txt")
else:
print("Local MRPC data not specified, downloading data from %s" % MRPC_TRAIN)
mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
urllib.request.urlretrieve(MRPC_TRAIN, mrpc_train_file)
urllib.request.urlretrieve(MRPC_TEST, mrpc_test_file)
assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file
assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file
urllib.request.urlretrieve(TASK2PATH["MRPC"], os.path.join(mrpc_dir, "dev_ids.tsv"))
dev_ids = []
with open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding="utf8") as ids_fh:
for row in ids_fh:
dev_ids.append(row.strip().split('\t'))
with open(mrpc_train_file, encoding="utf8") as data_fh, \
open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding="utf8") as train_fh, \
open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding="utf8") as dev_fh:
header = data_fh.readline()
train_fh.write(header)
dev_fh.write(header)
for row in data_fh:
label, id1, id2, s1, s2 = row.strip().split('\t')
if [id1, id2] in dev_ids:
dev_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
else:
train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
with open(mrpc_test_file, encoding="utf8") as data_fh, \
open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding="utf8") as test_fh:
header = data_fh.readline()
test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
for idx, row in enumerate(data_fh):
label, id1, id2, s1, s2 = row.strip().split('\t')
test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
print("\tCompleted!")
def download_diagnostic(data_dir):
print("Downloading and extracting diagnostic...")
if not os.path.isdir(os.path.join(data_dir, "diagnostic")):
os.mkdir(os.path.join(data_dir, "diagnostic"))
data_file = os.path.join(data_dir, "diagnostic", "diagnostic.tsv")
urllib.request.urlretrieve(TASK2PATH["diagnostic"], data_file)
print("\tCompleted!")
return
def get_tasks(task_names):
task_names = task_names.split(',')
if "all" in task_names:
tasks = TASKS
else:
tasks = []
for task_name in task_names:
assert task_name in TASKS, "Task %s not found!" % task_name
tasks.append(task_name)
return tasks
def main(arguments):
parser = argparse.ArgumentParser()
parser.add_argument('--data_dir', help='directory to save data to', type=str, default='glue_data')
parser.add_argument('--tasks', help='tasks to download data for as a comma separated string',
type=str, default='all')
parser.add_argument('--path_to_mrpc', help='path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_test.txt',
type=str, default='')
args = parser.parse_args(arguments)
if not os.path.isdir(args.data_dir):
os.mkdir(args.data_dir)
tasks = get_tasks(args.tasks)
for task in tasks:
if task == 'MRPC':
format_mrpc(args.data_dir, args.path_to_mrpc)
elif task == 'diagnostic':
download_diagnostic(args.data_dir)
else:
download_and_extract(task, args.data_dir)
if __name__ == '__main__':
sys.exit(main(sys.argv[1:]))
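Since main() takes its argument list explicitly, the script can be driven from the shell (python utils/download_glue_data.py --data_dir glue_data --tasks MRPC,SST) or programmatically; a sketch, assuming the module is importable from the repository root (the import path is an assumption, not something the file itself sets up):

    # Hypothetical programmatic use; mirrors the command line shown above.
    from utils.download_glue_data import main

    main(['--data_dir', 'glue_data', '--tasks', 'MRPC,SST'])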
---
- step:
name: Execute python examples/run_glue.py
image: pytorch/pytorch:nightly-devel-cuda10.0-cudnn7
command:
- python /valohai/repository/utils/download_glue_data.py --data_dir=/glue_data
- pip install -e .
- pip install -r examples/requirements.txt
- python examples/run_glue.py --do_train --data_dir=/glue_data/{parameter-value:task_name} {parameters}
parameters:
- name: model_type
pass-as: --model_type={v}
type: string
default: bert
- name: model_name_or_path
pass-as: --model_name_or_path={v}
type: string
default: bert-base-uncased
- name: task_name
pass-as: --task_name={v}
type: string
default: MRPC
- name: max_seq_length
pass-as: --max_seq_length={v}
description: The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.
type: integer
default: 128
- name: per_gpu_train_batch_size
pass-as: --per_gpu_train_batch_size={v}
description: Batch size per GPU/CPU for training.
type: integer
default: 8
- name: per_gpu_eval_batch_size
pass-as: --per_gpu_eval_batch_size={v}
description: Batch size per GPU/CPU for evaluation.
type: integer
default: 8
- name: gradient_accumulation_steps
pass-as: --gradient_accumulation_steps={v}
description: Number of updates steps to accumulate before performing a backward/update pass.
type: integer
default: 1
- name: learning_rate
pass-as: --learning_rate={v}
description: The initial learning rate for Adam.
type: float
default: 0.00005
- name: adam_epsilon
pass-as: --adam_epsilon={v}
description: Epsilon for Adam optimizer.
type: float
default: 0.00000001
- name: max_grad_norm
pass-as: --max_grad_norm={v}
description: Max gradient norm.
type: float
default: 1.0
- name: num_train_epochs
pass-as: --num_train_epochs={v}
description: Total number of training epochs to perform.
type: integer
default: 3
- name: max_steps
pass-as: --max_steps={v}
description: If > 0, set total number of training steps to perform. Override num_train_epochs.
type: integer
default: -1
- name: warmup_steps
pass-as: --warmup_steps={v}
description: Linear warmup over warmup_steps.
type: integer
default: -1
- name: logging_steps
pass-as: --logging_steps={v}
description: Log every X updates steps.
type: integer
default: 25
- name: save_steps
pass-as: --save_steps={v}
description: Save checkpoint every X updates steps.
type: integer
default: -1
- name: output_dir
pass-as: --output_dir={v}
type: string
default: /valohai/outputs
- name: evaluate_during_training
description: Run evaluation during training at each logging step.
type: flag
default: true
- name: do_lower_case
description: Set this flag if you are using an uncased model.
type: flag