Commit f51ceb7c authored by Mohammad Shoeybi

c++ code working

parent 6140718f
@@ -8,7 +8,7 @@ import torch
from torch.utils.data import Dataset
from dataset_utils import build_training_sample
#from data.mapping import build_training_samples_mapping
class AlbertDataSet(Dataset):
@@ -57,7 +57,7 @@ class AlbertDataSet(Dataset):
self.mask_id, self.pad_id,
self.masked_lm_prob, rng)
'''
def get_target_seq_length(max_num_tokens, short_seq_prob, np_rng):
"""With probability `short_seq_prob` generate a smaller sequence lenght."""
if np_rng.random() < short_seq_prob:
@@ -169,7 +169,7 @@ def build_training_samples_mapping(indexed_dataset, num_epochs, max_seq_length,
print('****************************************************************\n')
return samples_np
'''
# WILL BE REPLACED WITH JARED'S
class JaredDataset(object):
@@ -207,7 +207,7 @@ if __name__ == '__main__':
sentences.extend(sent)
yield sentences
input_file = '/raid/mshoeybi/data/albert/sample/samples_1000.json'
input_file = '/raid/mshoeybi/data/albert/sample/samples_11.json'
vocab_file = '/raid/mshoeybi/data/albert/bert_vocab/vocab.txt'
tokenizer = FullTokenizer(vocab_file, do_lower_case=True)
@@ -236,6 +236,55 @@ if __name__ == '__main__':
for i in range(1, len(doc_idx)):
doc_idx[i] += doc_idx[i-1]
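# After this prefix sum, doc_idx holds cumulative sentence offsets: the
# sentences of document d occupy indices [doc_idx[d], doc_idx[d+1]) of
# `sizes`, which is the layout the C++ build_mapping below expects (it
# checks that the last document value equals len(sizes)). Hypothetical
# example: documents with 2 and 3 sentences give doc_idx = [0, 2, 5], so
# document 1 owns sizes[2:5].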
#max_size = np.iinfo(np.int32).max // 32
import time
docs_np = np.array(doc_idx, dtype=np.uint32)
sizes_np = np.array(sizes, dtype=np.uint16)
start_time = time.time()
max_seq_length = 512
max_size = docs_np.shape[0]
lens = np.full(max_size, max_seq_length-3, dtype=np.uint16)
lens_rand = np.random.randint(low=2, high=(max_seq_length-2),
size=max_size//10, dtype=np.uint16)
lens_view = lens[:max_size//10]
np.copyto(lens_view, lens_rand)
np.random.shuffle(lens)
print('num docs', max_size)
print('lens time', time.time() - start_time)
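# The block above pre-computes target lengths in NumPy: roughly one entry
# in ten gets a random short length in [2, max_seq_length - 2) and the rest
# stay at max_seq_length - 3, mirroring short_seq_prob = 0.1 in the C++
# helper below. An illustrative sanity check (an assumption, not part of
# the original script):
assert lens.min() >= 2 and lens.max() <= max_seq_length - 3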
import helpers
start_time = time.time()
maps = helpers.build_mapping(docs_np, sizes_np, 10, 100, 509, 0.1, 1234)
print('maps time', time.time() - start_time)
print(maps)
exit()
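# helpers.build_mapping returns a (num_samples, 3) uint32 array whose rows
# are [first_sentence, last_sentence_exclusive, target_seq_length] into the
# flat sentence arrays. A hypothetical decoding of the first sample:
#
#   start, end, target_len = maps[0]
#   sample_tokens = sizes_np[start:end].sum()  # >= target_len unless the
#                                              # document ended early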
start_time = time.time()
max_size = 10 #np.iinfo(np.int32).max // 32
docs = np.arange(10, dtype=np.uint32)
print(docs)
a = example.doit(docs, max_size)
print(type(a))
print(a.shape)
print(a)
print(time.time() - start_time)
exit()
#start_time = time.time()
count = doit(maps, docs_np, sizes_np, lens, docs_np.shape[0]-1, 10)
print(count)
maps = maps[:count]
np.random.shuffle(maps)
print(time.time() - start_time)
exit()
indexed_dataset = JaredDataset(doc_idx, sizes, sentences_list)
dataset = AlbertDataSet(indexed_dataset=indexed_dataset,
tokenizer=tokenizer,
......
#include <algorithm>
#include <iostream>
#include <limits>
#include <math.h>
#include <stdexcept>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
namespace py = pybind11;
using namespace std;
inline uint32_t get_sample_len(const int short_seq_ratio,
const uint32_t max_length) {
/* Training sample length. */
const auto random_number = rand();
if ((random_number % short_seq_ratio) == 0) {
return 2 + random_number % (max_length - 1);
}
return max_length;
}
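// Note: for short_seq_prob = 0.1 the caller passes short_seq_ratio =
// round(1 / 0.1) = 10, so (rand() % 10 == 0) fires for roughly one call in
// ten, and the short length 2 + random_number % (max_length - 1) is drawn
// from [2, max_length]. All randomness comes from rand(), which is
// re-seeded with srand(seed) at the start of each of the two passes below
// so both passes draw identical lengths.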
py::array_t<uint32_t> build_mapping(const py::array_t<uint32_t>& docs_,
const py::array_t<uint16_t>& sizes_,
const int num_epochs,
const int max_num_samples,
const int max_seq_length,
const double short_seq_prob,
const int seed) {
cout << "> building dataset mapping for " << docs_.shape(0) - 1 <<
" documents with " << sizes_.shape(0) << " sentences ..." << endl;
// For efficiency, convert probability to ratio.
const int short_seq_ratio = int(round(1.0 / short_seq_prob));
// Remove bound checks.
auto docs = docs_.unchecked<1>();
auto sizes = sizes_.unchecked<1>();
// Check for consistency.
if (docs[docs.shape(0) - 1] != sizes.shape(0)) {
cout << "document values is not consistent with length of sizes: " <<
docs[docs.shape(0) - 1] << " != " << sizes.shape(0) << endl;
throw(-1);
}
// Mapping and its length (1D).
int num_samples = -1;
uint32_t* maps = nullptr;
// Perform two iterations: the first iteration computes the number of
// samples and allocates memory, and the second iteration populates the map.
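// (This count-then-fill pattern keeps the allocation exact and avoids the
// reallocation/copy cost of growing a std::vector; the price is running
// the sampling loop twice over identical rand() draws.)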
bool second = false;
for (int iteration=0; iteration < 2; ++iteration) {
// Set the seed so both iterations produce the same results.
srand(seed);
// Set the flag on second iteration.
if (iteration == 1) {
second = true;
}
// Counters:
uint32_t empty_docs = 0;
uint32_t one_sent_docs = 0;
// Current map index.
uint64_t map_index = 0;
// For each epoch:
for (int epoch=0; epoch < num_epochs; ++epoch) {
if (map_index >= uint64_t(max_num_samples)) {
cout << " > reached " << max_num_samples << " samples after " <<
epoch << " epochs ..." << endl;
break;
}
// For each document:
for (int doc=0; doc < (docs.shape(0) - 1); ++doc) {
// Document sentences are in [sent_index_first, sent_index_last).
const uint32_t sent_index_first = docs[doc];
const uint32_t sent_index_last = docs[doc + 1];
// At the beginning of the document, the previous index is the start index.
uint32_t prev_start_index = sent_index_first;
// Number of sentences remaining in the document.
uint32_t num_remain_sent = sent_index_last - sent_index_first;
// Some bookkeeping
if ((epoch == 0) && (!second)) {
if (num_remain_sent == 0) {
cout << "***WARNING*** document " << doc << " is empty" << endl;
empty_docs += 1;
}
if (num_remain_sent == 1) {
cout << "***WARNING*** document " << doc <<
" has one sentence" << endl;
one_sent_docs += 1;
}
}
// If the document has at least two sentences.
if (num_remain_sent > 1) {
// Set values.
uint32_t size = 0;
uint32_t num_sent = 0;
uint32_t seq_len = get_sample_len(short_seq_ratio, max_seq_length);
// Loop through sentences.
for (uint32_t sent_index=sent_index_first;
sent_index < sent_index_last; ++sent_index) {
// Add the size and number of sentences.
size += sizes[sent_index];
num_sent += 1;
num_remain_sent -= 1;
// Emit a sample if we have reached the target length,
// more than one sentence remains in the document,
// and the sample has at least two sentences;
// or if we have reached the end of the document.
if (((size >= seq_len) && (num_remain_sent > 1) &&
(num_sent > 1) ) || (num_remain_sent == 0)) {
// Populate the map.
if (second) {
const uint64_t map_index_0 = 3 * map_index;
maps[map_index_0] = prev_start_index;
maps[map_index_0 + 1] = sent_index + 1;
maps[map_index_0 + 2] = seq_len;
}
// Update indices / counters.
map_index += 1;
prev_start_index = sent_index + 1;
seq_len = get_sample_len(short_seq_ratio, max_seq_length);
size = 0;
num_sent = 0;
}
}
} // if (num_remain_sent > 1) {
} // for (int doc=0; doc < num_docs; ++doc) {
} // for (int epoch=0; epoch < num_epochs; ++epoch) {
// For now only support mappings up to MAX_INT.
if (map_index > std::numeric_limits<int>::max()) {
cout << "number of samples ("<< map_index <<") exceeded MAX_INT" << endl;
throw std::overflow_error("number of samples exceeded MAX_INT");
}
else if (!second) {
cout << " number of samples: " <<
map_index << endl;
cout << " number of empty documents: " <<
empty_docs << endl;
cout << " number of documents with one sentence: " <<
one_sent_docs << endl;
maps = new uint32_t[3*map_index];
num_samples = int(map_index);
}
} // for (int iteration=0; iteration < 2; ++iteration) {
// Shuffle the samples with an in-place Fisher-Yates pass over 3-element rows.
for (int i=(num_samples - 1); i > 0; --i) {
const int j = rand() % (i + 1);
uint64_t i0 = 3 * i;
uint64_t j0 = 3 * j;
// Swap values.
swap(maps[i0], maps[j0]);
swap(maps[i0 + 1], maps[j0 + 1]);
swap(maps[i0 + 2], maps[j0 + 2]);
}
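// Note: both passes above make the same sequence of rand() calls, so this
// shuffle starts from a deterministic position in the stream and the final
// sample order depends only on `seed`.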
cout << " > done building the mapping." << endl;
// Capsule that frees the mapping memory once the numpy array is destroyed.
py::capsule free_when_done(maps, [](void *mem_) {
uint32_t *mem = reinterpret_cast<uint32_t *>(mem_);
cout << "freeing memory for the dataset mapping" << endl;
delete[] mem;
});
// Return the numpy array.
return py::array_t<uint32_t>({num_samples, 3}, // shape
{3*4, 4}, // C-style contiguous strides
maps, // the data pointer
free_when_done); // numpy array references
}
PYBIND11_MODULE(helpers, m) {
m.def("build_mapping", &build_mapping);
}
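A minimal usage sketch, assuming the new file is saved as helpers.cpp (the filename is not shown in this commit) and pybind11 is installed. The build line is the standard pybind11 recipe; the arguments mirror the call in the Python driver above:

# c++ -O3 -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) \
#     helpers.cpp -o helpers$(python3-config --extension-suffix)
import numpy as np
import helpers

# Two hypothetical documents with 2 and 3 sentences; docs holds cumulative
# sentence offsets and sizes the per-sentence token counts.
docs = np.array([0, 2, 5], dtype=np.uint32)
sizes = np.array([200, 300, 150, 250, 400], dtype=np.uint16)
maps = helpers.build_mapping(docs, sizes,
                             10,    # num_epochs
                             100,   # max_num_samples
                             509,   # max_seq_length
                             0.1,   # short_seq_prob
                             1234)  # seed
print(maps.shape)  # (num_samples, 3): [start, end, target_seq_length] rows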