Merge pull request #1480 from huggingface/fix_ctrl_tokenizer

Fixing CTRL tokenizer - Update error messages - XLM-MLM in run_generation

Merge pull request #1480 from huggingface/fix_ctrl_tokenizer
Fixing CTRL tokenizer - Update error messages - XLM-MLM in run_generation
4bc46011 · Thomas Wolf · GitHub · 036483fa · 177a7212 · 4bc46011
Unverified Commit 4bc46011 authored Oct 10, 2019 by Thomas Wolf Committed by GitHub Oct 10, 2019
5 changed files
--- a/examples/run_generation.py
+++ b/examples/run_generation.py
@@ -107,7 +107,8 @@ def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')
    return logits


-def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.0, repetition_penalty=1.0, is_xlnet=False, xlm_lang=None, device='cpu'):
+def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.0, repetition_penalty=1.0,
+                    is_xlnet=False, is_xlm_mlm=False, xlm_mask_token=None, xlm_lang=None, device='cpu'):
    context = torch.tensor(context, dtype=torch.long, device=device)
    context = context.unsqueeze(0).repeat(num_samples, 1)
    generated = context
@@ -125,10 +126,16 @@ def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=
                target_mapping[0, 0, -1] = 1.0  # predict last token
                inputs = {'input_ids': input_ids, 'perm_mask': perm_mask, 'target_mapping': target_mapping}

+            if is_xlm_mlm and xlm_mask_token:
+                # XLM MLM models are direct models (predict same token, not next token)
+                # => need one additional dummy token in the input (will be masked and guessed)
+                input_ids = torch.cat((generated, torch.full((1, 1), xlm_mask_token, dtype=torch.long, device=device)), dim=1)
+                inputs = {'input_ids': input_ids}
+
            if xlm_lang is not None:
                inputs["langs"] = torch.tensor([xlm_lang] * inputs["input_ids"].shape[1], device=device).view(1, -1)

-            outputs = model(**inputs)  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states)
+            outputs = model(**inputs)  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet/CTRL (cached hidden-states)
            next_token_logits = outputs[0][0, -1, :] / (temperature if temperature > 0 else 1.)

            # reptition penalty from CTRL (https://arxiv.org/abs/1909.05858)
@@ -167,10 +174,7 @@ def main():
    parser.add_argument('--stop_token', type=str, default=None,
                        help="Token at which text generation is stopped")
    args = parser.parse_args()
-    if args.model_type in ["ctrl"]:
-        if args.temperature > 0.7 : 
-            print('CTRL typically works better with lower temperatures (and lower top_k).')
-        
+
    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()

@@ -190,7 +194,11 @@ def main():
    elif args.length < 0:
        args.length = MAX_LENGTH  # avoid infinite loop

-    print(args)
+    logger.info(args)
+    if args.model_type in ["ctrl"]:
+        if args.temperature > 0.7 : 
+            logger.info('CTRL typically works better with lower temperatures (and lower top_k).')
+
    while True:
        xlm_lang = None
        # XLM Language usage detailed in the issues #1414
@@ -204,6 +212,13 @@ def main():
                    language = input("Using XLM. Select language in " + str(list(tokenizer.lang2id.keys())) + " >>> ")
            xlm_lang = tokenizer.lang2id[language]

+        # XLM masked-language modeling (MLM) models need masked token (see details in sample_sequence)
+        is_xlm_mlm = args.model_type in ["xlm"] and 'mlm' in args.model_name_or_path
+        if is_xlm_mlm:
+            xlm_mask_token = tokenizer.mask_token_id
+        else:
+            xlm_mask_token = None
+
        raw_text = args.prompt if args.prompt else input("Model prompt >>> ")
        if args.model_type in ["transfo-xl", "xlnet"]:
            # Models with memory likes to have a long prompt for short inputs.
@@ -218,6 +233,8 @@ def main():
            top_p=args.top_p,
            repetition_penalty=args.repetition_penalty,
            is_xlnet=bool(args.model_type == "xlnet"),
+            is_xlm_mlm=is_xlm_mlm,
+            xlm_mask_token=xlm_mask_token,
            xlm_lang=xlm_lang,
            device=args.device,
        )

--- a/transformers/configuration_utils.py
+++ b/transformers/configuration_utils.py
@@ -130,20 +130,19 @@ class PretrainedConfig(object):
        # redirect to the cache, if necessary
        try:
            resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
-        except EnvironmentError as e:
+        except EnvironmentError:
            if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
-                logger.error(
-                    "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
-                        config_file))
+                msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
+                        config_file)
            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find any file "
-                    "associated to this path or url.".format(
+                msg = "Model name '{}' was not found in model name list ({}). " \
+                      "We assumed '{}' was a path or url to a configuration file named {} or " \
+                      "a directory containing such a file but couldn't find any such file at this path or url.".format(
                        pretrained_model_name_or_path,
                        ', '.join(cls.pretrained_config_archive_map.keys()),
-                        config_file))
-            raise e
+                        config_file, CONFIG_NAME)
+            raise EnvironmentError(msg)
+
        if resolved_config_file == config_file:
            logger.info("loading configuration file {}".format(config_file))
        else:

--- a/transformers/modeling_utils.py
+++ b/transformers/modeling_utils.py
@@ -316,20 +316,20 @@ class PreTrainedModel(nn.Module):
            # redirect to the cache, if necessary
            try:
                resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
-            except EnvironmentError as e:
+            except EnvironmentError:
                if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
-                    logger.error(
-                        "Couldn't reach server at '{}' to download pretrained weights.".format(
-                            archive_file))
+                    msg = "Couldn't reach server at '{}' to download pretrained weights.".format(
+                            archive_file)
                else:
-                    logger.error(
-                        "Model name '{}' was not found in model name list ({}). "
-                        "We assumed '{}' was a path or url but couldn't find any file "
-                        "associated to this path or url.".format(
+                    msg = "Model name '{}' was not found in model name list ({}). " \
+                        "We assumed '{}' was a path or url to model weight files named one of {} but " \
+                        "couldn't find any such file at this path or url.".format(
                            pretrained_model_name_or_path,
                            ', '.join(cls.pretrained_model_archive_map.keys()),
-                            archive_file))
-                raise e
+                            archive_file,
+                            [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME])
+                raise EnvironmentError(msg)
+
            if resolved_archive_file == archive_file:
                logger.info("loading weights file {}".format(archive_file))
            else:

--- a/transformers/tokenization_ctrl.py
+++ b/transformers/tokenization_ctrl.py
@@ -22,8 +22,6 @@ import os
 import regex as re
 from io import open

-from .tokenization_bert import BasicTokenizer
-
 from .tokenization_utils import PreTrainedTokenizer

 logger = logging.getLogger(__name__)
@@ -48,39 +46,11 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'ctrl': 256,
 }

-def text_standardize(text):
-    """
-    fixes some issues the spacy tokenizer had on books corpus
-    also does some whitespace standardization
-    """
-    text = text.replace('—', '-')
-    text = text.replace('–', '-')
-    text = text.replace('―', '-')
-    text = text.replace('…', '...')
-    text = text.replace('´', "'")
-    text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
-    text = re.sub(r'\s*\n\s*', ' \n ', text)
-    text = re.sub(r'[^\S\n]+', ' ', text)
-    return text.strip()
-
-
 def get_pairs(word):
    """Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
-    # pairs = []
-    # prev_char = word[0]
-    # for i, char in enumerate(word[1:]):
-    #     #_i = i + 1
-    #     #if word[_i+1:] == tuple('</w>'):
-    #     #    pairs.append((prev_char, char+'</w>'))
-    #     #    break
-    #     #else:
-    #     if True:
-    #         pairs.append((prev_char, char))
-    #         prev_char = char
-
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
@@ -166,7 +136,9 @@ class CTRLTokenizer(PreTrainedTokenizer):
        """ Tokenize a string.
        """
        split_tokens = []
+
        text = text.split(' ')
+
        for token in text:
            split_tokens.extend([t for t in self.bpe(token).split(' ')])
        return split_tokens

--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -337,13 +337,13 @@ class PreTrainedTokenizer(object):
                vocab_files[file_id] = full_file_name

            if all(full_file_name is None for full_file_name in vocab_files.values()):
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find tokenizer files"
-                    "at this path or url.".format(
+                raise EnvironmentError(
+                    "Model name '{}' was not found in tokenizers model name list ({}). "
+                    "We assumed '{}' was a path or url to a directory containing vocabulary files "
+                    "named {} but couldn't find such vocabulary files at this path or url.".format(
                        pretrained_model_name_or_path, ', '.join(s3_models),
-                        pretrained_model_name_or_path, ))
-                return None
+                        pretrained_model_name_or_path, 
+                        list(cls.vocab_files_names.values())))

        # Get files from url, cache, or disk depending on the case
        try:
@@ -353,17 +353,18 @@ class PreTrainedTokenizer(object):
                    resolved_vocab_files[file_id] = None
                else:
                    resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
-        except EnvironmentError as e:
+        except EnvironmentError:
            if pretrained_model_name_or_path in s3_models:
-                logger.error("Couldn't reach server to download vocabulary.")
+                msg = "Couldn't reach server at '{}' to download vocabulary files."
            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find files {} "
-                    "at this path or url.".format(
+                msg = "Model name '{}' was not found in tokenizers model name list ({}). " \
+                    "We assumed '{}' was a path or url to a directory containing vocabulary files " \
+                    "named {}, but couldn't find such vocabulary files at this path or url.".format(
                        pretrained_model_name_or_path, ', '.join(s3_models),
-                        pretrained_model_name_or_path, str(vocab_files.keys())))
-            raise e
+                        pretrained_model_name_or_path,
+                        list(cls.vocab_files_names.values()))
+
+            raise EnvironmentError(msg)

        for file_id, file_path in vocab_files.items():
            if file_path == resolved_vocab_files[file_id]: