helpers.cpp 8.25 KB
Newer Older
Mohammad Shoeybi's avatar
Mohammad Shoeybi committed
1
2
3
4
5

#include <algorithm>
#include <iostream>
#include <limits>
#include <math.h>
6
#include <stdexcept>
Mohammad Shoeybi's avatar
Mohammad Shoeybi committed
7
8
9
10
11
12
13
14
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>

namespace py = pybind11;
using namespace std;


/*
 * Draw the target length for the next training sample.
 *
 * One pseudo-random number is taken from rand(). When it is divisible by
 * short_seq_ratio (i.e. roughly once every short_seq_ratio calls) a
 * shortened length in [2, max_length] is returned; otherwise the full
 * max_length is returned.
 *
 * short_seq_ratio: inverse of the probability of producing a short
 *                  sample. A value of 0 disables short samples entirely
 *                  and no random number is drawn.
 * max_length:      upper bound for the returned length. Values below 2
 *                  leave no room for a shortened length, so max_length
 *                  itself is returned.
 *
 * The result depends on the global rand() state; callers that need
 * reproducibility must call srand() beforehand.
 */
inline uint32_t get_sample_len(const int short_seq_ratio,
                               const uint32_t max_length) {
    // A zero ratio would make the modulus below a division by zero.
    if (short_seq_ratio == 0) {
        return max_length;
    }

    const auto random_number = rand();

    // `max_length - 1` is 0 for max_length == 1 (division by zero) and
    // wraps around for max_length == 0, so only shorten when the range
    // [2, max_length] is non-empty.
    if (((random_number % short_seq_ratio) == 0) && (max_length >= 2)) {
        return 2 + random_number % (max_length - 1);
    }
    return max_length;
}

24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
/*
 * Build the (sample -> sentence range) mapping for a dataset.
 *
 * docs_:           1D array of sentence offsets; document d owns the
 *                  sentences [docs_[d], docs_[d + 1]). Its last entry must
 *                  equal the length of sizes_.
 * sizes_:          1D array with the size of every sentence.
 * num_epochs:      maximum number of passes over the documents.
 * max_num_samples: once this many samples exist at the start of an epoch,
 *                  no further epochs are processed.
 * max_seq_length:  full target length of a sample.
 * short_seq_prob:  probability of drawing a shortened target length;
 *                  values <= 0 disable shortening.
 * seed:            seed passed to srand() so both passes (and the final
 *                  shuffle) are reproducible.
 *
 * Returns a (num_samples, 3) numpy array of DocIdx whose rows are
 * [first sentence index, one-past-last sentence index, target length],
 * in shuffled order. The buffer is owned by the returned array (released
 * through a capsule).
 *
 * Throws std::runtime_error on inconsistent input and std::overflow_error
 * when the number of samples reaches the maximum value of DocIdx.
 */
template<typename DocIdx>
py::array build_mapping_impl(const py::array_t<uint32_t>& docs_,
                             const py::array_t<uint16_t>& sizes_,
                             const int num_epochs,
                             const uint64_t max_num_samples,
                             const int max_seq_length,
                             const double short_seq_prob,
                             const int seed) {

    std::cout << "> building dataset mapping for " << docs_.shape(0) - 1 <<
            " documents with " << sizes_.shape(0) << " sentences ..." << std::endl;

    // For efficiency, convert probability to ratio. A ratio of 0 means
    // "never shorten". The conversion is clamped to [1, INT_MAX] because
    // casting an out-of-range double (e.g. 1.0 / 0.0) to int is undefined
    // and a ratio that rounds down to 0 would cause a division by zero.
    int short_seq_ratio = 0;
    if (short_seq_prob > 0) {
        const double ratio = round(1.0 / short_seq_prob);
        if (ratio >= static_cast<double>(std::numeric_limits<int>::max())) {
            short_seq_ratio = std::numeric_limits<int>::max();
        } else {
            short_seq_ratio = std::max(1, static_cast<int>(ratio));
        }
    }

    // Draw the next target length; never hands a zero ratio to
    // get_sample_len.
    const auto next_seq_len = [short_seq_ratio, max_seq_length]() -> uint32_t {
        if (short_seq_ratio == 0) {
            return static_cast<uint32_t>(max_seq_length);
        }
        return get_sample_len(short_seq_ratio, max_seq_length);
    };

    // Remove bound checks.
    auto docs = docs_.unchecked<1>();
    auto sizes = sizes_.unchecked<1>();

    // Check for consistency. The emptiness test must come first because
    // the accessors above are unchecked.
    if (docs.shape(0) == 0) {
        std::cout << "docs array is empty" << std::endl;
        throw std::runtime_error("docs array is empty");
    }
    if (docs[docs.shape(0) - 1] != sizes.shape(0)) {
        std::cout << "document values is not consistent with length of sizes: " <<
                docs[docs.shape(0) - 1] << " != " << sizes.shape(0) << std::endl;
        throw std::runtime_error(
            "document values is not consistent with length of sizes");
    }

    const int64_t num_docs = docs.shape(0) - 1;

    // Mapping and its length (1D).
    int64_t num_samples = -1;
    DocIdx* maps = nullptr;

    // Perform two iterations, in the first iteration get the size
    // and allocate memory and in the second iteration populate the map.
    bool second = false;
    for (int iteration=0; iteration < 2; ++iteration) {

        // Set the seed so both iterations produce the same results.
        srand(seed);

        // Set the flag on second iteration.
        second = iteration == 1;

        // Counters:
        uint32_t empty_docs = 0;
        uint32_t one_sent_docs = 0;

        // Current map index.
        uint64_t map_index = 0;

        // For each epoch:
        for (int epoch=0; epoch < num_epochs; ++epoch) {
            // Stop in BOTH passes: the second pass must visit exactly the
            // epochs counted by the first one, otherwise it writes past
            // the end of the buffer sized from the first pass.
            if (map_index >= max_num_samples) {
                if (!second) {
                    std::cout << " > reached " << max_num_samples <<
                            " samples after " << epoch << " epochs ..." << std::endl;
                }
                break;
            }
            // For each document:
            for (int64_t doc=0; doc < num_docs; ++doc) {

                // Document sentences are in [sent_index_first, sent_index_last).
                const auto sent_index_first = docs[doc];
                const auto sent_index_last = docs[doc + 1];

                // At the beginning of the document the previous index is
                // the start index.
                auto prev_start_index = sent_index_first;

                // Remaining sentences in this document.
                auto num_remain_sent = sent_index_last - sent_index_first;

                // Some bookkeeping (reported once: first epoch, first pass).
                if ((epoch == 0) && (!second)) {
                    if (num_remain_sent == 0) {
                        std::cout << "***WARNING*** document " << doc <<
                                " is empty" << std::endl;
                        empty_docs += 1;
                    }
                    if (num_remain_sent == 1) {
                        std::cout << "***WARNING*** document " << doc <<
                                " has one sentence" << std::endl;
                        one_sent_docs += 1;
                    }
                }

                // Only documents with at least two sentences produce samples.
                if (num_remain_sent > 1) {

                    // Set values.
                    auto size = uint32_t{0};
                    auto num_sent = uint32_t{0};
                    auto seq_len = next_seq_len();

                    // Loop through sentences.
                    for (auto sent_index=sent_index_first;
                         sent_index < sent_index_last; ++sent_index) {

                        // Add the size and number of sentences.
                        size += sizes[sent_index];
                        num_sent += 1;
                        num_remain_sent -= 1;

                        // Close the current sample when either
                        //  - the target length is reached, more than one
                        //    sentence is left in the document, and the
                        //    sample already holds at least two sentences; or
                        //  - the end of the document is reached.
                        if (((size >= seq_len) && (num_remain_sent > 1) &&
                             (num_sent > 1) ) || (num_remain_sent == 0)) {

                            // Populate the map.
                            if (second) {
                                const auto map_index_0 = 3 * map_index;
                                maps[map_index_0] = prev_start_index;
                                maps[map_index_0 + 1] = sent_index + 1;
                                maps[map_index_0 + 2] = seq_len;
                            }

                            // Update indices / counters.
                            // check for overflow
                            if (map_index == std::numeric_limits<DocIdx>::max()) {
                                std::cout << "number of samples exceeded maximum allowed by type: "
                                     << std::numeric_limits<DocIdx>::max() << std::endl;
                                throw std::overflow_error("Number of samples");
                            }
                            map_index += 1;
                            prev_start_index = sent_index + 1;
                            seq_len = next_seq_len();
                            size = 0;
                            num_sent = 0;
                        }
                    }

                } // if (num_remain_sent > 1) {
            } // for (int64_t doc=0; doc < num_docs; ++doc) {
        } // for (int epoch=0; epoch < num_epochs; ++epoch) {

        if (!second) {
            std::cout << "    number of samples:                      " <<
                    map_index << std::endl;
            std::cout << "    number of empty documents:              " <<
                    empty_docs << std::endl;
            std::cout << "    number of documents with one sentence:  " <<
                    one_sent_docs << std::endl;
            maps = new DocIdx[3*map_index];
            num_samples = map_index;
        }

    } // for (int iteration=0; iteration < 2; ++iteration) {

    // Shuffle the rows, continuing the rand() stream seeded above.
    for (auto i=(num_samples - 1); i > 0; --i) {
        const auto j = rand() % (i + 1);
        const auto i0 = 3 * i;
        const auto j0 = 3 * j;
        // Swap values.
        std::swap(maps[i0], maps[j0]);
        std::swap(maps[i0 + 1], maps[j0 + 1]);
        std::swap(maps[i0 + 2], maps[j0 + 2]);
    }

    std::cout << " > done building the mapping." << std::endl;

    // Method to deallocate memory: invoked when the capsule (held by the
    // returned array as its base) is destroyed.
    py::capsule free_when_done(maps, [](void *mem_) {
            DocIdx *mem = reinterpret_cast<DocIdx*>(mem_);
            std::cout << "freeing memory for the dataset mapping" << std::endl;
            delete[] mem;
        });

    // Return the numpy array. Strides are derived from the element type so
    // they are correct for both 32-bit and 64-bit DocIdx.
    const auto byte_size = sizeof(DocIdx);
    return py::array(std::vector<int64_t>{num_samples, 3}, // shape
                     {3*byte_size, byte_size}, // C-style contiguous strides
                     maps, // the data pointer
                     free_when_done); // numpy array references

}
Mohammad Shoeybi's avatar
Mohammad Shoeybi committed
196

197
198
199
200
201
202
203
204
205
206
207
208
209
/*
 * Entry point exposed to Python: picks the element type of the returned
 * mapping from the number of elements in sizes_ and forwards every
 * argument unchanged to build_mapping_impl.
 *
 * A 32-bit element type is used unless sizes_ holds more elements than a
 * uint32_t can represent, in which case a 64-bit element type is used.
 */
py::array build_mapping(const py::array& docs_,
                        const py::array& sizes_,
                        const int num_epochs,
                        const uint64_t max_num_samples,
                        const int max_seq_length,
                        const double short_seq_prob,
                        const int seed) {
    const bool needs_wide_index =
        sizes_.size() > std::numeric_limits<uint32_t>::max();

    // Common case first: 32-bit indices are sufficient.
    if (!needs_wide_index) {
        return build_mapping_impl<uint32_t>(docs_, sizes_, num_epochs,
                                            max_num_samples, max_seq_length,
                                            short_seq_prob, seed);
    }

    return build_mapping_impl<uint64_t>(docs_, sizes_, num_epochs,
                                        max_num_samples, max_seq_length,
                                        short_seq_prob, seed);
}

// Python module definition: `import helpers` exposes build_mapping under
// the same name as the C++ function.
PYBIND11_MODULE(helpers, m) {
    m.def("build_mapping", &build_mapping,
          "Build the shuffled (num_samples, 3) sample mapping from the "
          "document offsets and sentence sizes.");
}