chunked_array.hpp 8.28 KB
Newer Older
1
2
3
4
5
6
/*!
 * Copyright (c) 2021 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 *
 * Author: Alberto Ferreira
 */
7
8
#ifndef LIGHTGBM_INCLUDE_LIGHTGBM_UTILS_CHUNKED_ARRAY_HPP_
#define LIGHTGBM_INCLUDE_LIGHTGBM_UTILS_CHUNKED_ARRAY_HPP_
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241

#include <LightGBM/utils/log.h>

#include <stdint.h>

#include <algorithm>
#include <new>
#include <vector>


namespace LightGBM {

/**
 * Container that manages a dynamic array of fixed-length chunks.
 *
 * The class also takes care of allocation & release of the underlying
 * memory. It can be used with either a high or low-level API.
 *
 * The high-level API allocates chunks as needed, manages addresses automatically and keeps
 * track of number of inserted elements, but is not thread-safe (this is ok as usually input is a streaming iterator).
 * For parallel input sources the low-level API must be used.
 *
 * Note: When using this for `LGBM_DatasetCreateFromMats` use a
 *       chunk_size multiple of #num_cols for your dataset, so each chunk
 *       contains "complete" instances.
 *
 * === High-level insert API intro ===
 *
 * The easiest way to use is:
 *  0. ChunkedArray(chunk_size)  # Choose appropriate size
 *  1. add(value)                # as many times as you want (will generate chunks as needed)
 *  2. data() or data_as_void()  # retrieves a T** or void** pointer (useful for `LGBM_DatasetCreateFromMats`).
 *
 * Useful query methods (all O(1)):
 *  - get_add_count()   # total count of added elements.
 *  - get_chunks_count()  # how many chunks are currently allocated.
 *  - get_last_chunk_add_count()  # for the last add() chunk, how many items there are.
 *  - get_chunk_size()    # get constant chunk_size from constructor call.
 *
 *  With those you can generate int32_t sizes[]. Last chunk can be smaller than chunk_size, so, for any i:
 *    - sizes[i<last] = get_chunk_size()
 *    - sizes[i==last] = get_last_chunk_add_count()
 *
 *
 * === Low-level insert API intro ===
 *
 * For advanced usage - useful for inserting in parallel - one can also:
 *  1. call new_chunk() at any time for as many chunks as needed.  (thread-UNsafe)
 *  2. call setitem(chunk, idx, value) to insert each value.       (thread-safe)
 *
 */
template <class T>
class ChunkedArray {
 public:
    /**
     * Creates the container and allocates its first chunk.
     *
     * @param chunk_size number of elements of type T stored in every chunk.
     *                   A value of 0 is reported through Log::Fatal.
     */
    explicit ChunkedArray(size_t chunk_size)
      : _chunk_size(chunk_size), _last_chunk_idx(0), _last_idx_in_last_chunk(0) {
        if (chunk_size == 0) {
            Log::Fatal("ChunkedArray chunk size must be larger than 0!");
        }
        new_chunk();
    }

    /**
     * Frees every allocated chunk.
     */
    ~ChunkedArray() {
        release();
    }

    // The chunks are held as raw pointers that the destructor delete[]s, so a
    // member-wise copy would leave two containers freeing the same memory.
    // Copying is therefore forbidden.
    ChunkedArray(const ChunkedArray&) = delete;
    ChunkedArray& operator=(const ChunkedArray&) = delete;

    /**
     * Adds a value to the chunks sequentially.
     * If the last chunk is full it creates a new one and appends to it.
     *
     * @param value value to insert.
     */
    void add(T value) {
        if (!within_bounds(_last_chunk_idx, _last_idx_in_last_chunk)) {
            // Current chunk is exhausted: move the insertion cursor to the start of a fresh chunk.
            new_chunk();
            ++_last_chunk_idx;
            _last_idx_in_last_chunk = 0;
        }

        CHECK_EQ(setitem(_last_chunk_idx, _last_idx_in_last_chunk, value), 0);
        ++_last_idx_in_last_chunk;
    }

    /**
     * @return Number of add() calls.
     */
    size_t get_add_count() const {
        return _last_chunk_idx * _chunk_size + _last_idx_in_last_chunk;
    }

    /**
     * @return Number of allocated chunks.
     */
    size_t get_chunks_count() const {
        return _chunks.size();
    }

    /**
     * @return Number of elements add()'ed in the last chunk.
     */
    size_t get_last_chunk_add_count() const {
        return _last_idx_in_last_chunk;
    }

    /**
     * Getter for the chunk size set at the constructor.
     *
     * @return Return the size of chunks.
     */
    size_t get_chunk_size() const {
        return _chunk_size;
    }

    /**
     * Returns the pointer to the raw chunks data.
     *
     * The pointer refers to the storage of the internal std::vector of chunk
     * pointers, so it may be invalidated whenever chunks are added or released.
     *
     * @return T** pointer to raw data.
     */
    T **data() noexcept {
        return _chunks.data();
    }

    /**
     * Returns the pointer to the raw chunks data, but cast to void**.
     * This is so ``LGBM_DatasetCreateFromMats`` accepts it.
     *
     * @return void** pointer to raw data.
     */
    void **data_as_void() noexcept {
        return reinterpret_cast<void**>(_chunks.data());
    }

    /**
     * Coalesces (copies chunked data) to a contiguous array of the same type.
     * It assumes that ``other`` has enough space to receive that data.
     * Nothing is copied when no chunk is allocated (e.g. after release()).
     *
     * @param other array with elements T of size >= this->get_add_count().
     * @param all_valid_addresses
     *            If true exports values from all valid addresses independently of add() count.
     *            Otherwise, exports only up to `get_add_count()` addresses.
     */
    void coalesce_to(T *other, bool all_valid_addresses = false) const {
        // Without this guard `get_chunks_count() - 1` would wrap around on an
        // empty container and the loops below would read out of bounds.
        if (_chunks.empty()) {
            return;
        }
        const size_t full_chunks = this->get_chunks_count() - 1;

        // Copy full chunks:
        size_t i = 0;
        for (size_t chunk = 0; chunk < full_chunks; ++chunk) {
            const T* chunk_ptr = _chunks[chunk];
            for (size_t in_chunk_idx = 0; in_chunk_idx < _chunk_size; ++in_chunk_idx) {
                other[i++] = chunk_ptr[in_chunk_idx];
            }
        }
        // Copy filled values from last chunk only:
        const size_t last_chunk_elems_to_copy = all_valid_addresses ? _chunk_size : this->get_last_chunk_add_count();
        const T* chunk_ptr = _chunks[full_chunks];
        for (size_t in_chunk_idx = 0; in_chunk_idx < last_chunk_elems_to_copy; ++in_chunk_idx) {
            other[i++] = chunk_ptr[in_chunk_idx];
        }
    }

    /**
     * Return value from array of chunks.
     *
     * @param chunk_index index of the chunk
     * @param index_within_chunk index within chunk
     * @param on_fail_value sentinel value. If out of bounds returns that value.
     *
     * @return the stored value, or on_fail_value if the address is out of bounds.
     */
    T getitem(size_t chunk_index, size_t index_within_chunk, T on_fail_value) const noexcept {
        if (within_bounds(chunk_index, index_within_chunk))
            return _chunks[chunk_index][index_within_chunk];
        else
            return on_fail_value;
    }

    /**
     * Sets the value at a specific address in one of the chunks.
     *
     * @param chunk_index index of the chunk
     * @param index_within_chunk index within chunk
     * @param value value to store
     *
     * @return 0 = success, -1 = out of bounds access.
     */
    int setitem(size_t chunk_index, size_t index_within_chunk, T value) noexcept {
        if (within_bounds(chunk_index, index_within_chunk)) {
            _chunks[chunk_index][index_within_chunk] = value;
            return 0;
        } else {
            return -1;
        }
    }

    /**
     * To reset storage call this.
     * Will release existing resources and prepare for reuse.
     *
     * NOTE(review): declared noexcept although new_chunk() can throw (the
     * std::vector may fail to grow, and presumably Log::Fatal throws as well);
     * an exception escaping from here terminates the program. The signature is
     * kept as-is for interface compatibility.
     */
    void clear() noexcept {
        release();
        new_chunk();
    }

    /**
     * Deletes all the allocated chunks.
     * Do not use container after this! See ``clear()`` instead.
     */
    void release() noexcept {
        std::for_each(_chunks.begin(), _chunks.end(), [](T* c) { delete[] c; });
        _chunks.clear();
        _chunks.shrink_to_fit();
        _last_chunk_idx = 0;
        _last_idx_in_last_chunk = 0;
    }

    /**
     * As the array is dynamic, checks whether a given address is currently within bounds.
     *
     * @param chunk_index index of the chunk
     * @param index_within_chunk index within that chunk
     * @return true if that chunk is already allocated and index_within_chunk < chunk size.
     */
    inline bool within_bounds(size_t chunk_index, size_t index_within_chunk) const {
        return (chunk_index < _chunks.size()) && (index_within_chunk < _chunk_size);
    }

    /**
     * Adds a new chunk to the array of chunks. Not thread-safe.
     *
     * On allocation failure every chunk is released and the error is reported
     * through Log::Fatal.
     */
    void new_chunk() {
        T* chunk = new (std::nothrow) T[_chunk_size];

        // Check memory allocation success:
        if (!chunk) {
            release();
            Log::Fatal("Memory exhausted! Cannot allocate new ChunkedArray chunk.");
            return;  // Never register a null chunk, should Log::Fatal return.
        }

        // push_back may throw while growing the vector; free the chunk in that
        // case, as nothing else owns it yet.
        try {
            _chunks.push_back(chunk);
        } catch (...) {
            delete[] chunk;
            throw;
        }
    }

 private:
    const size_t _chunk_size;  //!< Number of elements per chunk (fixed at construction)
    std::vector<T*> _chunks;   //!< Owned chunks, each allocated with new[]

    // For the add() interface & some of the get_*() queries:
    size_t _last_chunk_idx;  //!< Index of chunks
    size_t _last_idx_in_last_chunk;  //!< Index within chunk
};


}  // namespace LightGBM

260
#endif  // LIGHTGBM_INCLUDE_LIGHTGBM_UTILS_CHUNKED_ARRAY_HPP_