Commit 6140718f authored by Mohammad Shoeybi

before optimization

parent c125d247
@@ -7,18 +7,55 @@ import numpy as np
 import torch
 from torch.utils.data import Dataset
+from dataset_utils import build_training_sample


-# WILL BE REPLACED WITH JARED'S
-class JaredDataset(object):
-
-    def __init__(self, doc_idx, sizes, sentences):
-        self.doc_idx = doc_idx
-        self.num_docs = len(self.doc_idx) - 1
-        self.sizes = sizes
-        self.sentences = sentences
+class AlbertDataSet(Dataset):
+
+    def __init__(self, indexed_dataset, tokenizer, num_epochs,
+                 masked_lm_prob, max_seq_length, short_seq_prob, seed):
+        # Params to store.
+        self.seed = seed
+        self.masked_lm_prob = masked_lm_prob
+        self.max_seq_length = max_seq_length
+        # Indexed dataset.
+        self.indexed_dataset = indexed_dataset
+        # Build the samples mapping.
+        self.samples_mapping = build_training_samples_mapping(
+            indexed_dataset,
+            num_epochs,
+            self.max_seq_length,
+            short_seq_prob,
+            self.seed)
+        # Vocab stuff.
+        self.vocab_id_list = list(tokenizer.inv_vocab.keys())
+        self.vocab_id_to_token_dict = tokenizer.inv_vocab
+        self.cls_id = tokenizer.vocab['[CLS]']
+        self.sep_id = tokenizer.vocab['[SEP]']
+        self.mask_id = tokenizer.vocab['[MASK]']
+        self.pad_id = tokenizer.vocab['[PAD]']
+
+    def __len__(self):
+        return self.samples.shape[0]

     def __getitem__(self, idx):
-        return self.sentences[idx]
+        rng = random.Random(self.seed + idx)
+        start_index, end_index, seq_length = self.samples_mapping[idx]
+        sample = []
+        for index in range(start_index, end_index):
+            sample.append(self.indexed_dataset[index])
+        return build_training_sample(sample, seq_length,
+                                     self.max_seq_length,
+                                     self.vocab_id_list,
+                                     self.vocab_id_to_token_dict,
+                                     self.cls_id, self.sep_id,
+                                     self.mask_id, self.pad_id,
+                                     self.masked_lm_prob, rng)


 def get_target_seq_length(max_num_tokens, short_seq_prob, np_rng):
@@ -87,6 +124,7 @@ def build_training_samples_mapping(indexed_dataset, num_epochs, max_seq_length,
         while sent_index < sent_index_last:
             # Get the size.
+            assert indexed_dataset.sizes[sent_index] > 0
             size += indexed_dataset.sizes[sent_index]
             sent_index += 1
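Most of build_training_samples_mapping is collapsed in this view; only the size accumulation and the new assert are visible. As a rough sketch of the idea behind it, assuming the hidden surrounding loop greedily packs consecutive sentences of each document until a target length drawn with short_seq_prob is reached and records (start_index, end_index, seq_length) triples for AlbertDataSet.__getitem__ to consume, it could look like the following. The helper name and the exact short-sequence draw are illustrative, not the repository's code.

import numpy as np

def build_samples_mapping_sketch(sizes, doc_idx, max_seq_length,
                                 short_seq_prob, np_rng):
    # Illustrative only -- the real build_training_samples_mapping is mostly
    # collapsed in this diff. Greedily pack consecutive sentences of each
    # document until a target length is reached, and record the range.
    mapping = []
    for doc in range(len(doc_idx) - 1):
        sent_index = doc_idx[doc]
        sent_index_last = doc_idx[doc + 1]
        while sent_index < sent_index_last:
            # Desired length for this sample, occasionally shortened
            # (mirrors get_target_seq_length / short_seq_prob).
            target_seq_length = max_seq_length
            if np_rng.rand() < short_seq_prob:
                target_seq_length = np_rng.randint(5, max_seq_length + 1)
            start_index = sent_index
            size = 0
            # Accumulate sentence sizes, as in the loop shown above.
            while sent_index < sent_index_last and size < target_seq_length:
                assert sizes[sent_index] > 0
                size += sizes[sent_index]
                sent_index += 1
            # build_training_sample needs at least two sentences (A and B).
            if sent_index - start_index > 1:
                mapping.append((start_index, sent_index, target_seq_length))
    return np.array(mapping, dtype=np.int64)

# Toy usage: two documents with 3 and 2 sentences of the given token counts.
sizes = [120, 200, 180, 90, 260]
doc_idx = [0, 3, 5]
print(build_samples_mapping_sketch(sizes, doc_idx, 512, 0.1,
                                   np.random.RandomState(1234)))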
@@ -133,51 +171,17 @@ def build_training_samples_mapping(indexed_dataset, num_epochs, max_seq_length,
     return samples_np


-class AlbertDataSet(Dataset):
+# WILL BE REPLACED WITH JARED'S
+class JaredDataset(object):
 
-    def __init__(self, indexed_dataset, tokenizer, num_epochs,
-                 masked_lm_prob, max_seq_length, short_seq_prob, seed):
-        # Params to store.
-        self.seed = seed
-        self.masked_lm_prob = masked_lm_prob
-        self.max_seq_length = max_seq_length
-        # Indexed dataset.
-        self.indexed_dataset = indexed_dataset
-        # Build the samples mapping.
-        self.samples_mapping = build_training_samples_mapping(
-            indexed_dataset,
-            num_epochs,
-            self.max_seq_length,
-            short_seq_prob,
-            self.seed)
-        # Vocab stuff.
-        self.vocab_id_list = list(tokenizer.inv_vocab.keys())
-        self.vocab_id_to_token_dict = tokenizer.inv_vocab
-        self.cls_id = tokenizer.vocab['[CLS]']
-        self.sep_id = tokenizer.vocab['[SEP]']
-        self.mask_id = tokenizer.vocab['[MASK]']
-        self.pad_id = tokenizer.vocab['[PAD]']
-
-    def __len__(self):
-        return self.samples.shape[0]
+    def __init__(self, doc_idx, sizes, sentences):
+        self.doc_idx = doc_idx
+        self.num_docs = len(self.doc_idx) - 1
+        self.sizes = sizes
+        self.sentences = sentences
 
     def __getitem__(self, idx):
-        rng = random.Random(self.seed + idx)
-        start_index, end_index = self.samples_mapping[idx]
-        sample = []
-        for index in range(start_index, end_index):
-            sample.append(self.indexed_dataset[index])
-        return build_training_sample(sample, self.vocab_id_list,
-                                     self.vocab_id_to_token_dict,
-                                     self.cls_id, self.sep_id,
-                                     self.mask_id, self.pad_id,
-                                     self.masked_lm_prob, self.max_seq_length,
-                                     rng)
+        return self.sentences[idx]
@@ -198,10 +202,12 @@ if __name__ == '__main__':
         sentences = []
         for line in text.split('\n'):
             if line != '\n':
-                sentences.extend(nltk.tokenize.sent_tokenize(line))
+                sent = nltk.tokenize.sent_tokenize(line)
+                if sent:
+                    sentences.extend(sent)
         yield sentences

-    input_file = '/raid/mshoeybi/data/albert/sample/samples_11.json'
+    input_file = '/raid/mshoeybi/data/albert/sample/samples_1000.json'
     vocab_file = '/raid/mshoeybi/data/albert/bert_vocab/vocab.txt'
     tokenizer = FullTokenizer(vocab_file, do_lower_case=True)

@@ -212,19 +218,28 @@ if __name__ == '__main__':
     sentences_list = []
     for sentences in document_generator:
-        doc_idx.append(len(sentences))
+        num_sent = 0
         for sentence in sentences:
             tokens = tokenizer.tokenize(sentence)
-            ids = tokenizer.convert_tokens_to_ids(tokens)
-            sizes.append(len(ids))
-            sentences_list.append(ids)
+            if tokens:
+                ids = tokenizer.convert_tokens_to_ids(tokens)
+                if len(ids) == 0:
+                    print('****************')
+                    print(sentence)
+                    print(tokens)
+                    print(ids)
+                    print('****************')
+                sizes.append(len(ids))
+                sentences_list.append(ids)
+                num_sent += 1
+        doc_idx.append(num_sent)
     for i in range(1, len(doc_idx)):
         doc_idx[i] += doc_idx[i-1]

     indexed_dataset = JaredDataset(doc_idx, sizes, sentences_list)

     dataset = AlbertDataSet(indexed_dataset=indexed_dataset,
                             tokenizer=tokenizer,
-                            num_epochs=3,
+                            num_epochs=10,
                             masked_lm_prob=0.15,
                             max_seq_length=512,
                             short_seq_prob=0.1,
......
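The doc_idx bookkeeping in the __main__ block above is easy to misread: each document appends the number of sentences it kept, and the prefix-sum loop then turns those counts into cumulative offsets, so consecutive entries bound one document's sentences inside sizes and sentences_list. A toy trace, assuming doc_idx is initialized with a leading 0 (its initialization is collapsed in this view) and with invented sentence counts:

# Toy trace of the doc_idx bookkeeping, assuming doc_idx starts as [0]
# (its initialization is collapsed in this diff view).
doc_idx = [0]
sentence_counts = [3, 2, 4]          # invented: sentences kept per document
for num_sent in sentence_counts:
    doc_idx.append(num_sent)

for i in range(1, len(doc_idx)):
    doc_idx[i] += doc_idx[i - 1]

print(doc_idx)                        # [0, 3, 5, 9]
# Document d owns sentences doc_idx[d]:doc_idx[d+1] in sentences_list/sizes,
# which is exactly the range build_training_samples_mapping walks with
# sent_index < sent_index_last, and num_docs == len(doc_idx) - 1 == 3.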
@@ -5,13 +5,18 @@ import collections
 import numpy as np


-def build_training_sample(sample, vocab_id_list, vocab_id_to_token_dict,
-                          cls_id, sep_id, mask_id, pad_id,
-                          masked_lm_prob, max_seq_length, rng):
+def build_training_sample(sample,
+                          target_seq_length, max_seq_length,
+                          vocab_id_list, vocab_id_to_token_dict,
+                          cls_id, sep_id, mask_id, pad_id,
+                          masked_lm_prob, rng):
     """Build training sample.

     Arguments:
         sample: A list of sentences in which each sentence is a list of token ids.
+        target_seq_length: Desired sequence length.
+        max_seq_length: Maximum length of the sequence. All values are padded to
+            this length.
         vocab_id_list: List of vocabulary ids. Used to pick a random id.
         vocab_id_to_token_dict: A dictionary from vocab ids to text tokens.
         cls_id: Start of example id.
@@ -19,20 +24,19 @@ def build_training_sample(sample, vocab_id_list, vocab_id_to_token_dict,
         mask_id: Mask token id.
         pad_id: Padding token id.
         masked_lm_prob: Probability to mask tokens.
-        max_seq_length: Maximum length of the sequence. All values are padded to
-            this length.
         rng: Random number generator.
     """

     # We assume that we have at least two sentences in the sample.
     assert len(sample) > 1
+    assert target_seq_length <= max_seq_length

     # Divide sample into two segments (A and B).
     tokens_a, tokens_b, is_next_random = get_a_and_b_segments(sample, rng)

-    # Truncate to `max_sequence_length`.
+    # Truncate to `target_sequence_length`.
     # Note that we have to account for [CLS] A [SEP] B [SEP].
-    max_num_tokens = max_seq_length - 3
+    max_num_tokens = target_seq_length - 3
     truncate_segments(tokens_a, tokens_b, len(tokens_a), len(tokens_b),
                       max_num_tokens, rng)
@@ -421,11 +425,13 @@ if __name__ == '__main__':
     for s in samples[0]:
         sample.append(tokenizer.convert_tokens_to_ids(s))

     max_seq_length = 512
+    target_seq_length = 444
     masked_lm_prob = 0.15

     example = build_training_sample(sample,
+                                    target_seq_length, max_seq_length,
                                     vocab_id_list, vocab_id_to_token_dict,
                                     cls_id, sep_id, mask_id, pad_id,
-                                    masked_lm_prob, max_seq_length, rng)
+                                    masked_lm_prob, rng)

     orig_tokens = []
     for s in samples[0]:
......
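The split between target_seq_length and max_seq_length introduced here is a token-budget calculation: segments A and B may keep at most target_seq_length - 3 tokens, because [CLS], the [SEP] after A, and the [SEP] after B always occupy three positions, while everything up to max_seq_length is filled with pad_id. A small worked example using the values from the __main__ hunk above; the helper below is illustrative and not part of dataset_utils.

def token_budget_sketch(len_a, len_b, target_seq_length, max_seq_length):
    # Illustrative helper (not in dataset_utils): how many tokens the A/B
    # segments may keep and how much padding the final sample needs.
    assert target_seq_length <= max_seq_length
    max_num_tokens = target_seq_length - 3          # [CLS] A [SEP] B [SEP] overhead
    num_kept = min(len_a + len_b, max_num_tokens)   # truncate_segments trims to this
    num_pad = max_seq_length - (num_kept + 3)       # filled with pad_id
    return max_num_tokens, num_pad

# With the values from the hunk above: 444 - 3 = 441 usable tokens, and a
# fully packed sample is still padded with 512 - 444 = 68 pad_id tokens.
print(token_budget_sketch(300, 200, 444, 512))      # -> (441, 68)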