chenpangpang / transformers · Commit f54a5bd3

Raise error when using an mlm flag for a clm model + correct TextDataset

Authored by Lysandre on Feb 10, 2020; committed by Julien Chaumond on Feb 12, 2020.
Parent: 569897ce
Showing 1 changed file with 9 additions and 0 deletions.

examples/run_language_modeling.py  (+9, -0)
@@ -86,6 +86,9 @@ MODEL_CLASSES = {
 class TextDataset(Dataset):
     def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
         assert os.path.isfile(file_path)
+
+        block_size = block_size - (tokenizer.max_len - tokenizer.max_len_single_sentence)
+
         directory, filename = os.path.split(file_path)
         cached_features_file = os.path.join(
             directory, args.model_type + "_cached_lm_" + str(block_size) + "_" + filename
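
The line added to TextDataset shrinks each cached block by the number of special tokens the tokenizer wraps around a single sequence, so a block still fits the model's maximum input length once those tokens are added. A minimal sketch of the arithmetic, not part of the commit, assuming a transformers version contemporary with it (2.x, where tokenizers still expose max_len rather than model_max_length) and bert-base-uncased as an example checkpoint:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# max_len is the model's maximum input length; max_len_single_sentence is that
# length minus the special tokens added around one sequence, so the difference
# is the special-token count (2 for BERT: [CLS] and [SEP]).
num_special = tokenizer.max_len - tokenizer.max_len_single_sentence

block_size = 512 - num_special  # 510 tokens of raw text per cached block

# A block plus its special tokens now fits the model exactly.
assert block_size + num_special == tokenizer.max_len
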
@@ -195,6 +198,12 @@ def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -
 def mask_tokens(inputs: torch.Tensor, tokenizer: PreTrainedTokenizer, args) -> Tuple[torch.Tensor, torch.Tensor]:
     """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
+
+    if tokenizer.mask_token is None:
+        raise ValueError(
+            "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
+        )
+
     labels = inputs.clone()
     # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
     probability_matrix = torch.full(labels.shape, args.mlm_probability)
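
The guard added to mask_tokens turns a confusing downstream failure into an explicit error: causal-LM tokenizers such as GPT-2's define no mask token, so the --mlm flag cannot be honored. A small sketch of the check in isolation, not part of the commit, assuming the same 2.x-era API where an unset mask_token is returned as None:

from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# GPT-2 is a causal LM; its tokenizer has no mask token, so this raises
# the same ValueError the commit introduces.
if tokenizer.mask_token is None:
    raise ValueError(
        "This tokenizer does not have a mask token which is necessary for "
        "masked language modeling. Remove the --mlm flag if you want to use "
        "this tokenizer."
    )
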