Unverified Commit 3981ce3d authored by TAE YOUNGDON, committed by GitHub

Fix double-counting of special tokens in `language_modeling.py` (#11275)

* Update language_modeling.py

in "class TextDatasetForNextSentencePrediction(Dataset)", double considering "self.tokenizer.num_special_tokens_to_add(pair=True)" 

so, i remove self.block_size, and add parameter for "def create_examples_from_document". like "class LineByLineWithSOPTextDataset" do

* Update language_modeling.py
parent 5a34d8d9
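To make the double subtraction concrete, here is a minimal arithmetic sketch (not part of the commit). It assumes a BERT-style tokenizer, for which `tokenizer.num_special_tokens_to_add(pair=True)` returns 3 (`[CLS]` plus two `[SEP]`):

```python
block_size = 512
num_special = 3  # tokenizer.num_special_tokens_to_add(pair=True) for a BERT-style tokenizer

# Before this commit: __init__ already shrank the budget ...
shrunk_block_size = block_size - num_special             # 509
# ... and create_examples_from_document subtracted again:
max_num_tokens_before = shrunk_block_size - num_special  # 506 -- three positions wasted

# After this commit: the raw block_size is passed through, so the
# special tokens are carved out exactly once:
max_num_tokens_after = block_size - num_special          # 509
```

With the fix, every example regains those three token positions for actual text.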
@@ -354,7 +354,6 @@ class TextDatasetForNextSentencePrediction(Dataset):
         )
         assert os.path.isfile(file_path), f"Input file path {file_path} not found"
 
-        self.block_size = block_size - tokenizer.num_special_tokens_to_add(pair=True)
         self.short_seq_probability = short_seq_probability
         self.nsp_probability = nsp_probability
@@ -413,7 +412,7 @@ class TextDatasetForNextSentencePrediction(Dataset):
             logger.info(f"Creating examples from {len(self.documents)} documents.")
             self.examples = []
             for doc_index, document in enumerate(self.documents):
-                self.create_examples_from_document(document, doc_index)
+                self.create_examples_from_document(document, doc_index, block_size)
 
             start = time.time()
             with open(cached_features_file, "wb") as handle:
@@ -422,10 +421,10 @@ class TextDatasetForNextSentencePrediction(Dataset):
                 f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]"
             )
 
-    def create_examples_from_document(self, document: List[List[int]], doc_index: int):
+    def create_examples_from_document(self, document: List[List[int]], doc_index: int, block_size: int):
         """Creates examples for a single document."""
-        max_num_tokens = self.block_size - self.tokenizer.num_special_tokens_to_add(pair=True)
+        max_num_tokens = block_size - self.tokenizer.num_special_tokens_to_add(pair=True)
         # We *usually* want to fill up the entire sequence since we are padding
         # to `block_size` anyways, so short sequences are generally wasted
...
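For context, a hedged usage sketch of the patched class (the checkpoint name and corpus path below are placeholders, not from the commit). `block_size` is the full model sequence length; the room for the special tokens is now carved out once, inside `create_examples_from_document`:

```python
from transformers import BertTokenizer, TextDatasetForNextSentencePrediction

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # placeholder checkpoint

# corpus.txt is a placeholder: one sentence per line, with blank lines
# separating documents, as the NSP dataset expects.
dataset = TextDatasetForNextSentencePrediction(
    tokenizer=tokenizer,
    file_path="corpus.txt",
    block_size=512,  # full sequence length; special tokens are subtracted once internally
)
```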