Commit ebba9e92 authored by VictorSanh
Browse files

minor spring cleaning - missing configs + processing

parent b1e1a9f9
...@@ -42,6 +42,7 @@ class LmSeqsDataset(Dataset): ...@@ -42,6 +42,7 @@ class LmSeqsDataset(Dataset):
self.check() self.check()
self.remove_long_sequences() self.remove_long_sequences()
self.remove_empty_sequences() self.remove_empty_sequences()
self.remove_unknown_sequences()
self.check() self.check()
self.print_statistics() self.print_statistics()
...@@ -109,6 +110,22 @@ class LmSeqsDataset(Dataset): ...@@ -109,6 +110,22 @@ class LmSeqsDataset(Dataset):
new_size = len(self) new_size = len(self)
logger.info(f"Remove {init_size - new_size} too short (<=11 tokens) sequences.") logger.info(f"Remove {init_size - new_size} too short (<=11 tokens) sequences.")
def remove_unknown_sequences(self, max_unk_ratio=0.5):
    """
    Remove sequences with a (too) high level of unknown tokens.

    A sequence is kept only when the fraction of its tokens equal to the
    unknown-token id is strictly below `max_unk_ratio`. Nothing is done when
    `self.params.special_tok_ids` has no 'unk_token' entry.

    Args:
        max_unk_ratio: exclusive upper bound on the per-sequence ratio of
            unknown tokens. The default (0.5) keeps the threshold that was
            previously hard-coded.
    """
    special_tok_ids = self.params.special_tok_ids
    if 'unk_token' not in special_tok_ids:
        # No unknown-token id is configured, so there is nothing to filter on.
        return

    unk_token_id = special_tok_ids['unk_token']
    init_size = len(self)
    # Per-sequence count of positions holding the unknown-token id.
    unk_occs = np.array([np.count_nonzero(a == unk_token_id) for a in self.token_ids])
    # Boolean mask, True for the sequences to keep.
    # NOTE(review): assumes self.lengths is aligned with self.token_ids and
    # contains no zeros (otherwise the division yields NaN) -- confirm.
    indices = (unk_occs / self.lengths) < max_unk_ratio
    self.token_ids = self.token_ids[indices]
    self.lengths = self.lengths[indices]
    new_size = len(self)
    logger.info(f'Remove {init_size - new_size} sequences with a high level of unknown tokens ({max_unk_ratio:.0%}).')
def print_statistics(self): def print_statistics(self):
""" """
Print some statistics on the corpus. Only the master process. Print some statistics on the corpus. Only the master process.
......
{
"activation": "gelu",
"attention_dropout": 0.1,
"dim": 768,
"dropout": 0.1,
"hidden_dim": 3072,
"initializer_range": 0.02,
"max_position_embeddings": 512,
"n_heads": 12,
"n_layers": 6,
"sinusoidal_pos_embds": true,
"tie_weights_": true,
"vocab_size": 119547
}
\ No newline at end of file
{
"vocab_size": 50265,
"hidden_size": 768,
"num_hidden_layers": 6,
"num_attention_heads": 12,
"intermediate_size": 3072,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"attention_probs_dropout_prob": 0.1,
"max_position_embeddings": 514,
"type_vocab_size": 1,
"initializer_range": 0.02,
"layer_norm_eps": 0.00001
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment