/*!
 * Copyright (c) 2023 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 *
 * Author: Oliver Borchert
 */

#include <LightGBM/arrow.h>
#include <gtest/gtest.h>

#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <stdexcept>
#include <vector>

using LightGBM::ArrowChunkedArray;
using LightGBM::ArrowTable;

/* --------------------------------------------------------------------------------------------- */
/*                                             UTILS                                             */
/* --------------------------------------------------------------------------------------------- */
// This code is copied and adapted from the official Arrow producer examples:
// https://arrow.apache.org/docs/format/CDataInterface.html#exporting-a-struct-float32-utf8-array

/*!
 * \brief Release callback for schemas whose child table was allocated with `malloc`.
 *
 * Every child is released through its own callback (when it has one) and then freed, the
 * child pointer table is freed afterwards, and the schema is finally marked as released by
 * clearing its `release` member.
 *
 * \param schema The schema to release.
 */
static void release_schema(struct ArrowSchema* schema) {
  if (schema->children != nullptr) {
    for (int64_t idx = 0; idx < schema->n_children; ++idx) {
      struct ArrowSchema* const entry = schema->children[idx];
      // Children without a callback own nothing besides their own allocation
      if (entry->release != nullptr) {
        entry->release(entry);
      }
      free(entry);
    }
    free(schema->children);
  }

  // A null callback marks the structure as released
  schema->release = nullptr;
}

/*!
 * \brief Release callback for arrays whose children and buffers were allocated with
 *        `malloc`/`calloc`.
 *
 * Children are released (when they carry a callback) and freed first, then every non-null
 * buffer and the buffer table itself are freed, and finally the array is marked as released
 * by clearing its `release` member.
 *
 * \param array The array to release.
 */
static void release_array(struct ArrowArray* array) {
  if (array->children != nullptr) {
    for (int64_t idx = 0; idx < array->n_children; ++idx) {
      struct ArrowArray* const entry = array->children[idx];
      if (entry->release != nullptr) {
        entry->release(entry);
      }
      free(entry);
    }
    free(array->children);
  }

  // Individual buffers may be null (e.g. a missing validity bitmap), the table may be too
  for (int64_t idx = 0; idx < array->n_buffers; ++idx) {
    const void* const buffer = array->buffers[idx];
    if (buffer != nullptr) {
      free(const_cast<void*>(buffer));
    }
  }
  free(array->buffers);

  // A null callback marks the structure as released
  array->release = nullptr;
}

/* ------------------------------------------ PRODUCER ----------------------------------------- */
/*!
 * \brief Test fixture with helpers that build `ArrowArray` and `ArrowSchema` structures.
 *
 * All buffers, child tables and child structures are allocated with `malloc`/`calloc` so that
 * `release_array` and `release_schema` can `free` them again.
 */
class ArrowChunkedArrayTest : public testing::Test {
 protected:
  void SetUp() override {}

  /* -------------------------------------- ARRAY CREATION ------------------------------------- */

  /*!
   * \brief Build a validity bitmap covering `size` values.
   * \param size Number of values the bitmap has to cover.
   * \param null_indices Indices of the values to mark as null.
   * \return `nullptr` if there are no nulls, otherwise a `malloc`-ed bitmap with all bits set
   *         except for the bits of the null indices (bit `i % 8` of byte `i / 8`).
   */
  char* build_validity_bitmap(int64_t size, const std::vector<int64_t>& null_indices = {}) {
    if (null_indices.empty()) {
      return nullptr;
    }
    const auto num_bytes = static_cast<size_t>((size + 7) / 8);
    auto validity = static_cast<char*>(malloc(num_bytes * sizeof(char)));
    memset(validity, 0xff, num_bytes * sizeof(char));
    for (const auto idx : null_indices) {
      validity[idx / 8] &= ~(1 << (idx % 8));
    }
    return validity;
  }

  /*!
   * \brief Wrap a data buffer into an array with two buffers (validity bitmap and data).
   * \param data `malloc`-ed data buffer; it is freed by the array's release callback.
   * \param size Number of values stored in `data`.
   * \param offset Number of leading values to skip; the array length is `size - offset`.
   * \param null_indices Indices (relative to the start of `data`) of the null values.
   * \return The array with `release_array` installed as release callback.
   */
  ArrowArray build_primitive_array(void* data, int64_t size, int64_t offset,
                                   const std::vector<int64_t>& null_indices) {
    auto buffers = static_cast<const void**>(malloc(sizeof(void*) * 2));
    buffers[0] = build_validity_bitmap(size, null_indices);
    buffers[1] = data;

    ArrowArray arr;
    arr.length = size - offset;
    arr.null_count = static_cast<int64_t>(null_indices.size());
    arr.offset = offset;
    arr.n_buffers = 2;
    arr.n_children = 0;
    arr.buffers = buffers;
    arr.children = nullptr;
    arr.dictionary = nullptr;
    arr.release = &release_array;
    arr.private_data = nullptr;
    return arr;
  }

  /*!
   * \brief Create an array holding a copy of `values`.
   * \param values The values to copy into a freshly allocated data buffer.
   * \param offset Number of leading values to skip.
   * \param null_indices Indices of the values to mark as null.
   */
  template <typename T>
  ArrowArray create_primitive_array(const std::vector<T>& values, int64_t offset = 0,
                                    const std::vector<int64_t>& null_indices = {}) {
    // NOTE: Arrow arrays have 64-bit alignment but we can safely ignore this in tests
    auto buffer = static_cast<T*>(malloc(sizeof(T) * values.size()));
    for (size_t i = 0; i < values.size(); ++i) {
      buffer[i] = values[i];
    }
    return build_primitive_array(buffer, static_cast<int64_t>(values.size()), offset,
                                 null_indices);
  }

  /*!
   * \brief Create a boolean array; values are packed as one bit per value (bit `i % 8` of
   *        byte `i / 8`).
   * \param values The boolean values to pack.
   * \param offset Number of leading values to skip.
   * \param null_indices Indices of the values to mark as null.
   */
  ArrowArray create_primitive_array(const std::vector<bool>& values, int64_t offset = 0,
                                    const std::vector<int64_t>& null_indices = {}) {
    const auto num_bytes = (values.size() + 7) / 8;
    auto buffer = static_cast<char*>(calloc(num_bytes, sizeof(char)));
    for (size_t i = 0; i < values.size(); ++i) {
      // By using `calloc` above, we only need to set 'true' values
      if (values[i]) {
        buffer[i / 8] |= (1 << (i % 8));
      }
    }
    return build_primitive_array(buffer, static_cast<int64_t>(values.size()), offset,
                                 null_indices);
  }

  /*!
   * \brief Create an array without buffers whose children are shallow copies of `arrays`.
   *
   * The copies share their buffers with the originals, hence releasing the returned array
   * also frees the buffers referenced by the entries of `arrays`.
   *
   * \param arrays The child arrays.
   * \return The nested array; its length is the length of the first child (0 if there are
   *         no children).
   */
  ArrowArray created_nested_array(const std::vector<ArrowArray*>& arrays) {
    auto children = static_cast<ArrowArray**>(malloc(sizeof(ArrowArray*) * arrays.size()));
    for (size_t i = 0; i < arrays.size(); ++i) {
      auto child = static_cast<ArrowArray*>(malloc(sizeof(ArrowArray)));
      *child = *arrays[i];
      children[i] = child;
    }

    ArrowArray arr;
    // Guard against reading `children[0]` when no child was passed
    arr.length = arrays.empty() ? 0 : children[0]->length;
    arr.null_count = 0;
    arr.offset = 0;
    arr.n_buffers = 0;
    arr.n_children = static_cast<int64_t>(arrays.size());
    arr.buffers = nullptr;
    arr.children = children;
    arr.dictionary = nullptr;
    arr.release = &release_array;
    arr.private_data = nullptr;
    return arr;
  }

  /* ------------------------------------- SCHEMA CREATION ------------------------------------- */

  /*!
   * \brief Build a schema without children for the given format string.
   * \param format Format string; it is stored as-is and must outlive the schema.
   * \return The schema; it owns no memory, so no release callback is installed.
   */
  static ArrowSchema make_primitive_schema(const char* format) {
    ArrowSchema schema;
    schema.format = format;
    schema.name = nullptr;
    schema.metadata = nullptr;
    schema.flags = 0;
    schema.n_children = 0;
    schema.children = nullptr;
    schema.dictionary = nullptr;
    schema.release = nullptr;
    schema.private_data = nullptr;
    return schema;
  }

  /*!
   * \brief Create the schema for values of type `T`.
   * \throws std::logic_error for every type without an explicit specialization.
   */
  template <typename T>
  ArrowSchema create_primitive_schema() {
    throw std::logic_error("not implemented");
  }

  /*!
   * \brief Create a "+s" schema whose children are shallow copies of the given schemas.
   * \param arrays The child schemas.
   * \return The nested schema with `release_schema` installed as release callback.
   */
  ArrowSchema create_nested_schema(const std::vector<ArrowSchema*>& arrays) {
    auto children = static_cast<ArrowSchema**>(malloc(sizeof(ArrowSchema*) * arrays.size()));
    for (size_t i = 0; i < arrays.size(); ++i) {
      auto child = static_cast<ArrowSchema*>(malloc(sizeof(ArrowSchema)));
      *child = *arrays[i];
      children[i] = child;
    }

    ArrowSchema schema;
    schema.format = "+s";
    schema.name = nullptr;
    schema.metadata = nullptr;
    schema.flags = 0;
    schema.n_children = static_cast<int64_t>(arrays.size());
    schema.children = children;
    schema.dictionary = nullptr;
    schema.release = &release_schema;
    schema.private_data = nullptr;
    return schema;
  }
};

// Explicit specializations live at namespace scope: not every compiler accepts them inside
// the class body.

/*! \brief Schema for `float` values (format string "f"). */
template <>
ArrowSchema ArrowChunkedArrayTest::create_primitive_schema<float>() {
  return make_primitive_schema("f");
}

/*! \brief Schema for `bool` values (format string "b"). */
template <>
ArrowSchema ArrowChunkedArrayTest::create_primitive_schema<bool>() {
  return make_primitive_schema("b");
}

/* --------------------------------------------------------------------------------------------- */
/*                                             TESTS                                             */
/* --------------------------------------------------------------------------------------------- */

// The length of a chunked array is the sum of its chunk lengths, where the length of a chunk
// does not include the values skipped by its offset.
TEST_F(ArrowChunkedArrayTest, GetLength) {
  auto schema = create_primitive_schema<float>();

  // Single chunk
  std::vector<float> dat1 = {1, 2};
  auto arr1 = create_primitive_array(dat1);
  ArrowChunkedArray ca1(1, &arr1, &schema);
  ASSERT_EQ(ca1.get_length(), 2);

  // Two chunks of different lengths
  std::vector<float> dat2 = {3, 4, 5, 6};
  auto arr2 = create_primitive_array(dat1);
  auto arr3 = create_primitive_array(dat2);
  ArrowArray arrs[2] = {arr2, arr3};
  ArrowChunkedArray ca2(2, arrs, &schema);
  ASSERT_EQ(ca2.get_length(), 6);

  // Boolean chunk with an offset, described by the matching boolean schema
  auto bool_schema = create_primitive_schema<bool>();
  std::vector<bool> dat3 = {true, false, true, true};
  auto arr4 = create_primitive_array(dat3, 1);
  ArrowChunkedArray ca3(1, &arr4, &bool_schema);
  ASSERT_EQ(ca3.get_length(), 3);

  // Free the buffers allocated by the helpers. `arr2`/`arr3` share their buffers with the
  // entries of `arrs`, so only the entries handed to the chunked array are released.
  arr1.release(&arr1);
  for (auto& chunk : arrs) {
    chunk.release(&chunk);
  }
  arr4.release(&arr4);
}

// A table built from one nested chunk exposes each child as a column.
TEST_F(ArrowChunkedArrayTest, GetColumns) {
  // Nested schema with two float fields
  auto first_field = create_primitive_schema<float>();
  auto second_field = create_primitive_schema<float>();
  std::vector<ArrowSchema*> fields = {&first_field, &second_field};
  auto schema = create_nested_schema(fields);

  // Nested array with one child per column
  std::vector<float> first_values = {1, 2, 3};
  std::vector<float> second_values = {4, 5, 6};
  auto first_child = create_primitive_array(first_values);
  auto second_child = create_primitive_array(second_values);
  std::vector<ArrowArray*> children = {&first_child, &second_child};
  auto arr = created_nested_array(children);

  ArrowTable table(1, &arr, &schema);
  ASSERT_EQ(table.get_num_rows(), 3);
  ASSERT_EQ(table.get_num_columns(), 2);

  // First column
  auto ca1 = table.get_column(0);
  ASSERT_EQ(ca1.get_length(), 3);
  ASSERT_EQ(*ca1.begin<int32_t>(), 1);

  // Second column
  auto ca2 = table.get_column(1);
  ASSERT_EQ(ca2.get_length(), 3);
  ASSERT_EQ(*ca2.begin<int32_t>(), 4);
}

// Iterators move across chunk boundaries, can be subscripted and can be subtracted.
TEST_F(ArrowChunkedArrayTest, IteratorArithmetic) {
  std::vector<float> dat1 = {1, 2};
  auto arr1 = create_primitive_array(dat1);
  std::vector<float> dat2 = {3, 4, 5, 6};
  auto arr2 = create_primitive_array(dat2);
  std::vector<float> dat3 = {7};
  auto arr3 = create_primitive_array(dat3);
  auto schema = create_primitive_schema<float>();

  ArrowArray arrs[3] = {arr1, arr2, arr3};
  ArrowChunkedArray ca(3, arrs, &schema);

  // Arithmetic
  auto it = ca.begin<int32_t>();
  ASSERT_EQ(*it, 1);
  ++it;
  ASSERT_EQ(*it, 2);
  ++it;
  ASSERT_EQ(*it, 3);
  it += 2;
  ASSERT_EQ(*it, 5);
  it += 2;
  ASSERT_EQ(*it, 7);
  --it;
  ASSERT_EQ(*it, 6);

  // Subscripts
  ASSERT_EQ(it[0], 1);
  ASSERT_EQ(it[1], 2);
  ASSERT_EQ(it[2], 3);
  ASSERT_EQ(it[6], 7);

  // End
  auto end = ca.end<int32_t>();
  ASSERT_EQ(end - it, 2);
  ASSERT_EQ(end - ca.begin<int32_t>(), 7);

  // Free the buffers allocated by the helpers. `arr1`..`arr3` share their buffers with the
  // entries of `arrs`, so only the entries handed to the chunked array are released.
  for (auto& chunk : arrs) {
    chunk.release(&chunk);
  }
}

// Bit-packed boolean chunks are read as 0/1 and null values are read as NaN.
TEST_F(ArrowChunkedArrayTest, BooleanIterator) {
  std::vector<bool> dat1 = {false, true, false};
  auto arr1 = create_primitive_array(dat1, 0, {2});
  std::vector<bool> dat2 = {false, false, false, false, true, true, true, true, false, true};
  auto arr2 = create_primitive_array(dat2, 1);
  auto schema = create_primitive_schema<bool>();

  ArrowArray arrs[2] = {arr1, arr2};
  ArrowChunkedArray ca(2, arrs, &schema);

  // Check for values in first chunk
  auto it = ca.begin<float>();
  ASSERT_EQ(*it, 0);
  ASSERT_EQ(*(++it), 1);
  ASSERT_TRUE(std::isnan(*(++it)));

  // Check for some values in second chunk
  ASSERT_EQ(*(++it), 0);
  it += 3;
  ASSERT_EQ(*it, 1);
  it += 4;
  ASSERT_EQ(*it, 0);
  ASSERT_EQ(*(++it), 1);

  // Check end
  ASSERT_EQ(++it, ca.end<float>());

  // Free the buffers allocated by the helpers. `arr1`/`arr2` share their buffers with the
  // entries of `arrs`, so only the entries handed to the chunked array are released.
  for (auto& chunk : arrs) {
    chunk.release(&chunk);
  }
}

// Null indices refer to positions in the underlying buffer, so with an offset of 2 the nulls
// at buffer positions 2 and 3 are the first two values seen by the iterator.
TEST_F(ArrowChunkedArrayTest, OffsetAndValidity) {
  std::vector<float> dat = {0, 1, 2, 3, 4, 5, 6};
  auto arr = create_primitive_array(dat, 2, {2, 3});
  auto schema = create_primitive_schema<float>();
  ArrowChunkedArray ca(1, &arr, &schema);

  auto it = ca.begin<double>();
  ASSERT_TRUE(std::isnan(*it));
  ASSERT_TRUE(std::isnan(*(++it)));
  ASSERT_EQ(it[2], 4);
  ASSERT_EQ(it[4], 6);

  // Free the buffers allocated by the helper
  arr.release(&arr);
}