/*!
 * Copyright (c) 2023 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 *
 * Author: Oliver Borchert
 */

#include <LightGBM/arrow.h>
#include <gtest/gtest.h>

#include <cmath>
#include <cstdlib>
#include <cstring>
#include <stdexcept>
#include <vector>

using LightGBM::ArrowChunkedArray;
using LightGBM::ArrowTable;

/* --------------------------------------------------------------------------------------------- */
/*                                             UTILS                                             */
/* --------------------------------------------------------------------------------------------- */
// This code is copied and adapted from the official Arrow producer examples:
// https://arrow.apache.org/docs/format/CDataInterface.html#exporting-a-struct-float32-utf8-array

/*!
 * \brief Release callback installed on schemas built by the test helpers.
 *
 * Invokes the release callback of each child schema (when one is set), frees the child
 * structs and the child pointer table, and finally clears `schema->release`.
 *
 * \param schema The schema being released; the struct itself is not freed here.
 */
static void release_schema(struct ArrowSchema* schema) {
  if (schema->children != nullptr) {
    const int64_t child_count = schema->n_children;
    int64_t idx = 0;
    while (idx < child_count) {
      struct ArrowSchema* const entry = schema->children[idx++];
      // Children without a release callback only need their struct freed
      if (entry->release != nullptr) {
        entry->release(entry);
      }
      free(entry);
    }
    free(schema->children);
  }

  // Clear the callback once cleanup is done
  schema->release = nullptr;
}

/*!
 * \brief Release callback installed on arrays built by the test helpers.
 *
 * Invokes the release callback of each child array (when one is set), frees the child structs
 * and the child pointer table, frees every non-null buffer and the buffer pointer table, and
 * finally clears `array->release`.
 *
 * \param array The array being released; the struct itself is not freed here.
 */
static void release_array(struct ArrowArray* array) {
  // Children first
  if (array->children != nullptr) {
    const int64_t child_count = array->n_children;
    int64_t idx = 0;
    while (idx < child_count) {
      struct ArrowArray* const entry = array->children[idx++];
      if (entry->release != nullptr) {
        entry->release(entry);
      }
      free(entry);
    }
    free(array->children);
  }

  // Then the individual buffers, followed by the table that holds their pointers
  const int64_t buffer_count = array->n_buffers;
  for (int64_t slot = 0; slot < buffer_count; ++slot) {
    const void* const buffer = array->buffers[slot];
    if (buffer != nullptr) {
      free(const_cast<void*>(buffer));
    }
  }
  free(array->buffers);

  // Clear the callback once cleanup is done
  array->release = nullptr;
}

/* ------------------------------------------ PRODUCER ----------------------------------------- */

/*!
 * \brief Test fixture with helpers that build `ArrowArray` / `ArrowSchema` structs from
 *        `std::vector` inputs.
 *
 * All buffers are allocated with `malloc`/`calloc`; arrays and nested schemas get
 * `release_array` / `release_schema` installed as their release callback.
 */
class ArrowChunkedArrayTest : public testing::Test {
 protected:
  void SetUp() override {}

  /* -------------------------------------- ARRAY CREATION ------------------------------------- */

  /*!
   * \brief Build a validity bitmap in which every bit is set except the given indices.
   * \param size Number of elements the bitmap must cover (physical size, before any offset).
   * \param null_indices Physical indices whose validity bit is cleared.
   * \return A `malloc`-ed bitmap of `(size + 7) / 8` bytes, or `nullptr` if `null_indices`
   *         is empty.
   */
  char* build_validity_bitmap(int64_t size, std::vector<int64_t> null_indices = {}) {
    if (null_indices.empty()) {
      return nullptr;
    }
    auto num_bytes = (size + 7) / 8;
    auto validity = static_cast<char*>(malloc(num_bytes * sizeof(char)));
    memset(validity, 0xff, num_bytes * sizeof(char));
    for (auto idx : null_indices) {
      // Bit `idx % 8` of byte `idx / 8` belongs to element `idx`
      validity[idx / 8] &= ~(1 << (idx % 8));
    }
    return validity;
  }

  /*!
   * \brief Wrap an already allocated data buffer into a two-buffer `ArrowArray`.
   * \param data Data buffer; it is stored as buffer 1 and freed by `release_array`.
   * \param size Physical number of elements in `data`.
   * \param offset Number of leading elements to skip; the array length is `size - offset`.
   * \param null_indices Physical indices of null elements.
   * \return The array with `release_array` installed as release callback.
   */
  ArrowArray build_primitive_array(void* data, int64_t size, int64_t offset,
                                   std::vector<int64_t> null_indices) {
    auto buffers = static_cast<const void**>(malloc(sizeof(void*) * 2));
    buffers[0] = build_validity_bitmap(size, null_indices);
    buffers[1] = data;

    // Only nulls inside the logical range [offset, size) belong to this array
    int64_t null_count = 0;
    for (auto idx : null_indices) {
      if (idx >= offset && idx < size) {
        ++null_count;
      }
    }

    ArrowArray arr;
    arr.length = size - offset;
    arr.null_count = null_count;
    arr.offset = offset;
    arr.n_buffers = 2;
    arr.n_children = 0;
    arr.buffers = buffers;
    arr.children = nullptr;
    arr.dictionary = nullptr;
    arr.release = &release_array;
    arr.private_data = nullptr;
    return arr;
  }

  /*!
   * \brief Create a primitive array holding a copy of `values`.
   * \param values Element values; copied into a freshly `malloc`-ed buffer.
   * \param offset Number of leading elements to skip.
   * \param null_indices Physical indices of null elements.
   */
  template <typename T>
  ArrowArray create_primitive_array(const std::vector<T>& values, int64_t offset = 0,
                                    std::vector<int64_t> null_indices = {}) {
    // NOTE: Arrow arrays have 64-bit alignment but we can safely ignore this in tests
    auto buffer = static_cast<T*>(malloc(sizeof(T) * values.size()));
    for (size_t i = 0; i < values.size(); ++i) {
      buffer[i] = values[i];
    }
    return build_primitive_array(buffer, static_cast<int64_t>(values.size()), offset,
                                 null_indices);
  }

  /*!
   * \brief Create a boolean array; values are bit-packed (bit `i % 8` of byte `i / 8`).
   * \param values Element values.
   * \param offset Number of leading elements to skip.
   * \param null_indices Physical indices of null elements.
   */
  ArrowArray create_primitive_array(const std::vector<bool>& values, int64_t offset = 0,
                                    std::vector<int64_t> null_indices = {}) {
    auto num_bytes = (values.size() + 7) / 8;
    auto buffer = static_cast<char*>(calloc(num_bytes, sizeof(char)));
    for (size_t i = 0; i < values.size(); ++i) {
      // By using `calloc` above, we only need to set 'true' values
      if (values[i]) {
        buffer[i / 8] |= (1 << (i % 8));
      }
    }
    return build_primitive_array(buffer, static_cast<int64_t>(values.size()), offset,
                                 null_indices);
  }

  /*!
   * \brief Create an array without buffers whose children are shallow copies of `arrays`.
   * \param arrays Child arrays; each struct is copied into a `malloc`-ed child.
   * \return The nested array; its length is that of the first child (0 if there is none).
   */
  ArrowArray created_nested_array(const std::vector<ArrowArray*>& arrays) {
    auto children = static_cast<ArrowArray**>(malloc(sizeof(ArrowArray*) * arrays.size()));
    for (size_t i = 0; i < arrays.size(); ++i) {
      auto child = static_cast<ArrowArray*>(malloc(sizeof(ArrowArray)));
      *child = *arrays[i];
      children[i] = child;
    }

    ArrowArray arr;
    // Guard against dereferencing a non-existent first child
    arr.length = arrays.empty() ? 0 : children[0]->length;
    arr.null_count = 0;
    arr.offset = 0;
    arr.n_buffers = 0;
    arr.n_children = static_cast<int64_t>(arrays.size());
    arr.buffers = nullptr;
    arr.children = children;
    arr.dictionary = nullptr;
    arr.release = &release_array;
    arr.private_data = nullptr;
    return arr;
  }

  /* ------------------------------------- SCHEMA CREATION ------------------------------------- */

  /*!
   * \brief Build a childless schema with the given format string and no release callback.
   * \param format Format string stored in the schema (not copied).
   */
  static ArrowSchema make_primitive_schema(const char* format) {
    ArrowSchema schema;
    schema.format = format;
    schema.name = nullptr;
    schema.metadata = nullptr;
    schema.flags = 0;
    schema.n_children = 0;
    schema.children = nullptr;
    schema.dictionary = nullptr;
    schema.release = nullptr;
    schema.private_data = nullptr;
    return schema;
  }

  /*!
   * \brief Create the schema of a primitive array with element type `T`.
   *
   * Specializations for `float` and `bool` are defined right after this class; any other
   * type throws `std::logic_error`.
   */
  template <typename T>
  ArrowSchema create_primitive_schema() {
    throw std::logic_error("not implemented");
  }

  /*!
   * \brief Create a "+s" schema whose children are shallow copies of `arrays`.
   * \param arrays Child schemas; each struct is copied into a `malloc`-ed child.
   * \return The nested schema with `release_schema` installed as release callback.
   */
  ArrowSchema create_nested_schema(const std::vector<ArrowSchema*>& arrays) {
    auto children = static_cast<ArrowSchema**>(malloc(sizeof(ArrowSchema*) * arrays.size()));
    for (size_t i = 0; i < arrays.size(); ++i) {
      auto child = static_cast<ArrowSchema*>(malloc(sizeof(ArrowSchema)));
      *child = *arrays[i];
      children[i] = child;
    }

    ArrowSchema schema;
    schema.format = "+s";
    schema.name = nullptr;
    schema.metadata = nullptr;
    schema.flags = 0;
    schema.n_children = static_cast<int64_t>(arrays.size());
    schema.children = children;
    schema.dictionary = nullptr;
    schema.release = &release_schema;
    schema.private_data = nullptr;
    return schema;
  }
};

// Explicit specializations live at namespace scope so that every compiler accepts them.

/*! \brief Schema with format "f". */
template <>
ArrowSchema ArrowChunkedArrayTest::create_primitive_schema<float>() {
  return make_primitive_schema("f");
}

/*! \brief Schema with format "b". */
template <>
ArrowSchema ArrowChunkedArrayTest::create_primitive_schema<bool>() {
  return make_primitive_schema("b");
}

/* --------------------------------------------------------------------------------------------- */
/*                                             TESTS                                             */
/* --------------------------------------------------------------------------------------------- */

// Checks `get_length()` for a single chunk, for two chunks, and for a chunk with an offset.
TEST_F(ArrowChunkedArrayTest, GetLength) {
  auto schema = create_primitive_schema<float>();

  // One chunk with two elements
  std::vector<float> dat1 = {1, 2};
  auto arr1 = create_primitive_array(dat1);
  ArrowChunkedArray ca1(1, &arr1, &schema);
  ASSERT_EQ(ca1.get_length(), 2);

  // Two chunks with 2 + 4 elements
  std::vector<float> dat2 = {3, 4, 5, 6};
  auto arr2 = create_primitive_array(dat1);
  auto arr3 = create_primitive_array(dat2);
  ArrowArray arrs[2] = {arr2, arr3};
  ArrowChunkedArray ca2(2, arrs, &schema);
  ASSERT_EQ(ca2.get_length(), 6);

  // Four boolean elements with an offset of 1 leave three elements;
  // the boolean data is paired with a boolean schema
  auto bool_schema = create_primitive_schema<bool>();
  std::vector<bool> dat3 = {true, false, true, true};
  auto arr4 = create_primitive_array(dat3, 1);
  ArrowChunkedArray ca3(1, &arr4, &bool_schema);
  ASSERT_EQ(ca3.get_length(), 3);
}

// Checks row/column counts of an `ArrowTable` built from one nested array with two float
// children, and the length and first value of each column.
TEST_F(ArrowChunkedArrayTest, GetColumns) {
  // Column data: two float children with three rows each
  std::vector<float> first_values = {1, 2, 3};
  auto first_child = create_primitive_array(first_values);
  std::vector<float> second_values = {4, 5, 6};
  auto second_child = create_primitive_array(second_values);
  auto batch = created_nested_array({&first_child, &second_child});

  // Matching nested schema
  auto first_schema = create_primitive_schema<float>();
  auto second_schema = create_primitive_schema<float>();
  auto batch_schema = create_nested_schema({&first_schema, &second_schema});

  ArrowTable table(1, &batch, &batch_schema);
  ASSERT_EQ(table.get_num_rows(), 3);
  ASSERT_EQ(table.get_num_columns(), 2);

  // First column
  auto column0 = table.get_column(0);
  ASSERT_EQ(column0.get_length(), 3);
  ASSERT_EQ(*column0.begin<int32_t>(), 1);

  // Second column
  auto column1 = table.get_column(1);
  ASSERT_EQ(column1.get_length(), 3);
  ASSERT_EQ(*column1.begin<int32_t>(), 4);
}

// Checks increment, decrement, compound addition, subscripting and iterator difference on an
// iterator over three float chunks holding the values 1..7.
TEST_F(ArrowChunkedArrayTest, IteratorArithmetic) {
  std::vector<float> dat1 = {1, 2};
  auto arr1 = create_primitive_array(dat1);
  std::vector<float> dat2 = {3, 4, 5, 6};
  auto arr2 = create_primitive_array(dat2);
  std::vector<float> dat3 = {7};
  auto arr3 = create_primitive_array(dat3);
  auto schema = create_primitive_schema<float>();

  ArrowArray arrs[3] = {arr1, arr2, arr3};
  ArrowChunkedArray ca(3, arrs, &schema);

  // Arithmetic (the `+= 2` steps cross chunk boundaries)
  auto it = ca.begin<int32_t>();
  ASSERT_EQ(*it, 1);
  ++it;
  ASSERT_EQ(*it, 2);
  ++it;
  ASSERT_EQ(*it, 3);
  it += 2;
  ASSERT_EQ(*it, 5);
  it += 2;
  ASSERT_EQ(*it, 7);
  --it;
  ASSERT_EQ(*it, 6);

  // Subscripts: the expected values show that indexing counts from the start of the
  // chunked array, not from the current position of `it` (which points at 6 here)
  ASSERT_EQ(it[0], 1);
  ASSERT_EQ(it[1], 2);
  ASSERT_EQ(it[2], 3);
  ASSERT_EQ(it[6], 7);

  // End
  auto end = ca.end<int32_t>();
  ASSERT_EQ(end - it, 2);
  ASSERT_EQ(end - ca.begin<int32_t>(), 7);
}

// Checks iteration over two bit-packed boolean chunks read as float: the first chunk has a
// null at index 2, the second chunk starts at an offset of 1.
TEST_F(ArrowChunkedArrayTest, BooleanIterator) {
  std::vector<bool> dat1 = {false, true, false};
  auto arr1 = create_primitive_array(dat1, 0, {2});
  std::vector<bool> dat2 = {false, false, false, false, true, true, true, true, false, true};
  auto arr2 = create_primitive_array(dat2, 1);
  auto schema = create_primitive_schema<bool>();

  ArrowArray arrs[2] = {arr1, arr2};
  ArrowChunkedArray ca(2, arrs, &schema);

  // Check for values in first chunk (the null element is expected to read as NaN)
  auto it = ca.begin<float>();
  ASSERT_EQ(*it, 0);
  ASSERT_EQ(*(++it), 1);
  ASSERT_TRUE(std::isnan(*(++it)));

  // Check for some values in second chunk (positions are relative to its offset of 1)
  ASSERT_EQ(*(++it), 0);
  it += 3;
  ASSERT_EQ(*it, 1);
  it += 4;
  ASSERT_EQ(*it, 0);
  ASSERT_EQ(*(++it), 1);

  // Check end
  ASSERT_EQ(++it, ca.end<float>());
}

// Checks that an offset and a validity bitmap are applied together: with an offset of 2 and
// nulls at physical indices 2 and 3, the first two elements read as NaN.
TEST_F(ArrowChunkedArrayTest, OffsetAndValidity) {
  std::vector<float> dat = {0, 1, 2, 3, 4, 5, 6};
  auto arr = create_primitive_array(dat, 2, {2, 3});
  auto schema = create_primitive_schema<float>();
  ArrowChunkedArray ca(1, &arr, &schema);

  auto it = ca.begin<double>();
  ASSERT_TRUE(std::isnan(*it));
  ASSERT_TRUE(std::isnan(*(++it)));
  // Remaining elements map to physical indices 4 and 6
  ASSERT_EQ(it[2], 4);
  ASSERT_EQ(it[4], 6);

  // Free the buffers allocated by `create_primitive_array`
  arr.release(&arr);
}