chenpangpang / transformers · Commits · ebba9e92
"tests/tokenization_gpt2_test.py" did not exist on "f2a337b3ed5ecc7339e0103af74f8f54b56d22ce"
Commit ebba9e92, authored Jan 10, 2020 by VictorSanh

minor spring cleaning - missing configs + processing

Parent: b1e1a9f9
Showing 3 changed files with 46 additions and 0 deletions (+46, -0):
examples/distillation/lm_seqs_dataset.py (+17, -0)
examples/distillation/training_configs/distilbert-base-multilingual-cased.json (+15, -0)
examples/distillation/training_configs/distilroberta-base.json (+14, -0)
examples/distillation/lm_seqs_dataset.py

@@ -42,6 +42,7 @@ class LmSeqsDataset(Dataset):
         self.check()
         self.remove_long_sequences()
         self.remove_empty_sequences()
+        self.remove_unknown_sequences()
         self.check()
         self.print_statistics()

@@ -109,6 +110,22 @@ class LmSeqsDataset(Dataset):
         new_size = len(self)
         logger.info(f"Remove {init_size - new_size} too short (<=11 tokens) sequences.")

+    def remove_unknown_sequences(self):
+        """
+        Remove sequences with a (too) high level of unknown tokens.
+        """
+        if 'unk_token' not in self.params.special_tok_ids:
+            return
+        else:
+            unk_token_id = self.params.special_tok_ids['unk_token']
+        init_size = len(self)
+        unk_occs = np.array([np.count_nonzero(a == unk_token_id) for a in self.token_ids])
+        indices = (unk_occs / self.lengths) < 0.5
+        self.token_ids = self.token_ids[indices]
+        self.lengths = self.lengths[indices]
+        new_size = len(self)
+        logger.info(f'Remove {init_size - new_size} sequences with a high level of unknown tokens (50%).')
+
     def print_statistics(self):
         """
         Print some statistics on the corpus. Only the master process.
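The core of the new method is a NumPy boolean mask over a ragged corpus. Below is a minimal standalone sketch of the same filtering rule with toy data; unk_token_id, token_ids, and lengths here are illustrative stand-ins for the dataset's attributes, not part of the commit:

import numpy as np

unk_token_id = 0  # assumption: id 0 marks the unknown token in this toy vocabulary

# Ragged corpus: one int array of token ids per sequence (dtype=object keeps it ragged).
token_ids = np.array(
    [np.array([0, 0, 0, 5]), np.array([7, 8, 9, 10]), np.array([0, 4, 6, 2, 9, 11])],
    dtype=object,
)
lengths = np.array([len(a) for a in token_ids])

# Count unknown tokens per sequence, then keep only sequences where
# unknowns make up strictly less than 50% of the tokens -- the commit's rule.
unk_occs = np.array([np.count_nonzero(a == unk_token_id) for a in token_ids])
keep = (unk_occs / lengths) < 0.5

token_ids = token_ids[keep]
lengths = lengths[keep]
print(f"kept {len(token_ids)} of {len(keep)} sequences")  # -> kept 2 of 3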
examples/distillation/training_configs/distilbert-base-multilingual-cased.json (new file, mode 100644)

{
    "activation": "gelu",
    "attention_dropout": 0.1,
    "dim": 768,
    "dropout": 0.1,
    "hidden_dim": 3072,
    "initializer_range": 0.02,
    "max_position_embeddings": 512,
    "n_heads": 12,
    "n_layers": 6,
    "sinusoidal_pos_embds": true,
    "tie_weights_": true,
    "vocab_size": 119547
}
(no newline at end of file)
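This file uses DistilBERT's own key names (dim, n_heads, n_layers) and is presumably consumed by the distillation training script as a student architecture config. A minimal sketch of reading it back with transformers, assuming a repository-root working directory:

from transformers import DistilBertConfig

# Load the student architecture config added in this commit.
config = DistilBertConfig.from_json_file(
    "examples/distillation/training_configs/distilbert-base-multilingual-cased.json"
)
print(config.n_layers, config.n_heads, config.vocab_size)  # -> 6 12 119547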
examples/distillation/training_configs/distilroberta-base.json (new file, mode 100644)

{
    "vocab_size": 50265,
    "hidden_size": 768,
    "num_hidden_layers": 6,
    "num_attention_heads": 12,
    "intermediate_size": 3072,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "attention_probs_dropout_prob": 0.1,
    "max_position_embeddings": 514,
    "type_vocab_size": 1,
    "initializer_range": 0.02,
    "layer_norm_eps": 0.00001
}
(no newline at end of file)
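Unlike the DistilBERT config, this file uses BERT/RoBERTa-family key names (hidden_size, num_hidden_layers). The same loading sketch with RobertaConfig; note that max_position_embeddings of 514 is not a typo:

from transformers import RobertaConfig

config = RobertaConfig.from_json_file(
    "examples/distillation/training_configs/distilroberta-base.json"
)
# 514 = 512 usable positions + 2 offset slots, since RoBERTa starts
# position ids at padding_idx + 1 (i.e. at 2).
print(config.num_hidden_layers, config.max_position_embeddings)  # -> 6 514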