Fix tf random token masking probability in data collator (#21834)

* fix tf random mask tokens probability * fix tf random mask tokens probability in collator for language modelling

Fix tf random token masking probability in data collator (#21834)
* fix tf random mask tokens probability * fix tf random mask tokens probability in collator for language modelling
2d506ea4 · anruijian · GitHub · 4fe744f5 · 2d506ea4
Unverified Commit 2d506ea4 authored Feb 28, 2023 by anruijian Committed by GitHub Feb 28, 2023
Hide whitespace changes
Inline Side-by-side

Showing with 2 additions and 2 deletions

src/transformers/data/data_collator.py src/transformers/data/data_collator.py +2 -2

No files found.
--- a/src/transformers/data/data_collator.py
+++ b/src/transformers/data/data_collator.py
@@ -679,7 +679,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
        inputs = tf.where(indices_replaced, mask_token_id, inputs)
        # 10% of the time, we replace masked input tokens with random word
-        indices_random = self.tf_bernoulli(input_shape, 0.1) & masked_indices & ~indices_replaced
+        indices_random = self.tf_bernoulli(input_shape, 0.5) & masked_indices & ~indices_replaced
        random_words = tf.random.uniform(input_shape, maxval=vocab_size, dtype=tf.int64)
        inputs = tf.where(indices_random, random_words, inputs)
@@ -1062,7 +1062,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
        inputs = tf.where(indices_replaced, self.tokenizer.mask_token_id, inputs)
        # 10% of the time, we replace masked input tokens with random word
-        indices_random = self.tf_bernoulli(input_shape, 0.1) & masked_indices & ~indices_replaced
+        indices_random = self.tf_bernoulli(input_shape, 0.5) & masked_indices & ~indices_replaced
        random_words = tf.random.uniform(input_shape, maxval=len(self.tokenizer), dtype=tf.int64)
        inputs = tf.where(indices_random, random_words, inputs)