"git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "cdba71475f3751e26b62beb3e6bfec9a3f6f6962"
test_arrow.cpp 10.2 KB
Newer Older
1
2
3
4
5
6
7
8
/*!
 * Copyright (c) 2023 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 *
 * Author: Oliver Borchert
 */

#include <LightGBM/arrow.h>
9
#include <gtest/gtest.h>
10
11

#include <cmath>
12
#include <cstdlib>
#include <cstring>
#include <stdexcept>
#include <vector>
13
14
15
16

using LightGBM::ArrowChunkedArray;
using LightGBM::ArrowTable;

17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
/* --------------------------------------------------------------------------------------------- */
/*                                             UTILS                                             */
/* --------------------------------------------------------------------------------------------- */
// This code is copied and adapted from the official Arrow producer examples:
// https://arrow.apache.org/docs/format/CDataInterface.html#exporting-a-struct-float32-utf8-array

/*!
 * \brief Release callback installed on schemas built by the helpers in this file.
 *
 * Invokes the release callback of every child that still has one, frees each child struct and the
 * table of child pointers, and finally marks the schema as released by clearing its callback.
 *
 * \param schema Schema to release; the struct itself is not freed here
 */
static void release_schema(struct ArrowSchema* schema) {
  struct ArrowSchema** const kids = schema->children;
  if (kids != nullptr) {
    const int64_t num_kids = schema->n_children;
    int64_t pos = 0;
    while (pos < num_kids) {
      struct ArrowSchema* const kid = kids[pos++];
      // A child without a callback has nothing of its own to release, but its struct is still freed
      if (kid->release != nullptr) {
        kid->release(kid);
      }
      free(kid);
    }
    free(kids);
  }

  // A null callback marks the schema as released
  schema->release = nullptr;
}

/*!
 * \brief Release callback installed on arrays built by the helpers in this file.
 *
 * Children are released and freed first, then every non-null buffer and the buffer table are
 * freed, and finally the array is marked as released by clearing its callback.
 *
 * \param array Array to release; the struct itself is not freed here
 */
static void release_array(struct ArrowArray* array) {
  struct ArrowArray** const kids = array->children;
  if (kids != nullptr) {
    const int64_t num_kids = array->n_children;
    int64_t pos = 0;
    while (pos < num_kids) {
      struct ArrowArray* const kid = kids[pos++];
      if (kid->release != nullptr) {
        kid->release(kid);
      }
      free(kid);
    }
    free(kids);
  }

  // Null entries (e.g. an absent validity bitmap) are skipped
  const int64_t num_buffers = array->n_buffers;
  for (int64_t pos = 0; pos < num_buffers; ++pos) {
    const void* const buffer = array->buffers[pos];
    if (buffer == nullptr) {
      continue;
    }
    free(const_cast<void*>(buffer));
  }
  free(array->buffers);

  // A null callback marks the array as released
  array->release = nullptr;
}

/* ------------------------------------------ PRODUCER ----------------------------------------- */

67
68
69
70
/*!
 * \brief Test fixture with helpers that hand-build `ArrowArray` structs for the tests below.
 *
 * Every buffer, buffer table, child struct and child table is allocated with malloc/calloc and
 * every array built here installs `release_array` as its release callback.
 */
class ArrowChunkedArrayTest : public testing::Test {
 protected:
  void SetUp() override {}

  /* -------------------------------------- ARRAY CREATION ------------------------------------- */

  /*!
   * \brief Build a validity bitmap with one bit per element.
   * \param size Number of elements the bitmap has to cover
   * \param null_indices Element positions whose bit is cleared; all other bits are set
   * \return malloc'ed bitmap, or nullptr if `null_indices` is empty
   */
  char* build_validity_bitmap(int64_t size, std::vector<int64_t> null_indices = {}) {
    if (null_indices.empty()) {
      return nullptr;
    }
    const auto num_bytes = (size + 7) / 8;
    auto validity = static_cast<char*>(malloc(num_bytes * sizeof(char)));
    // Start with all bits set, then clear the bit of every null element
    memset(validity, 0xff, num_bytes * sizeof(char));
    for (auto idx : null_indices) {
      validity[idx / 8] &= ~(1 << (idx % 8));
    }
    return validity;
  }

  /*!
   * \brief Wrap an existing data buffer into a two-buffer `ArrowArray` (validity + data).
   * \param data malloc'ed data buffer; it is stored in the array and freed by `release_array`
   * \param size Number of elements stored in `data`
   * \param offset Value stored in the array's `offset` field; the length is `size - offset`
   * \param null_indices Positions passed to `build_validity_bitmap`; their count is the null count
   * \return The assembled array
   */
  ArrowArray build_primitive_array(void* data, int64_t size, int64_t offset,
                                   std::vector<int64_t> null_indices) {
    auto buffers = static_cast<const void**>(malloc(sizeof(void*) * 2));
    buffers[0] = build_validity_bitmap(size, null_indices);
    buffers[1] = data;

    ArrowArray arr;
    arr.length = size - offset;
    arr.null_count = static_cast<int64_t>(null_indices.size());
    arr.offset = offset;
    arr.n_buffers = 2;
    arr.n_children = 0;
    arr.buffers = buffers;
    arr.children = nullptr;
    arr.dictionary = nullptr;
    arr.release = &release_array;
    arr.private_data = nullptr;
    return arr;
  }

  /*!
   * \brief Create a primitive array holding a copy of `values`.
   * \param values Values copied element by element into a freshly malloc'ed buffer
   * \param offset Forwarded to `build_primitive_array`
   * \param null_indices Forwarded to `build_primitive_array`
   * \return The assembled array
   */
  template <typename T>
  ArrowArray create_primitive_array(const std::vector<T>& values, int64_t offset = 0,
                                    std::vector<int64_t> null_indices = {}) {
    // NOTE: Arrow arrays have 64-bit alignment but we can safely ignore this in tests
    auto buffer = static_cast<T*>(malloc(sizeof(T) * values.size()));
    for (size_t i = 0; i < values.size(); ++i) {
      buffer[i] = values[i];
    }
    return build_primitive_array(buffer, values.size(), offset, null_indices);
  }

  /*!
   * \brief Create a boolean array, packing `values` into one bit per element.
   * \param values Booleans to pack; element `i` ends up in bit `i % 8` of byte `i / 8`
   * \param offset Forwarded to `build_primitive_array`
   * \param null_indices Forwarded to `build_primitive_array`
   * \return The assembled array
   */
  ArrowArray create_primitive_array(const std::vector<bool>& values, int64_t offset = 0,
                                    std::vector<int64_t> null_indices = {}) {
    const auto num_bytes = (values.size() + 7) / 8;
    auto buffer = static_cast<char*>(calloc(num_bytes, sizeof(char)));
    for (size_t i = 0; i < values.size(); ++i) {
      // By using `calloc` above, we only need to set 'true' values
      if (values[i]) {
        buffer[i / 8] |= (1 << (i % 8));
      }
    }
    return build_primitive_array(buffer, values.size(), offset, null_indices);
  }

  /*!
   * \brief Create an array without buffers whose children are shallow copies of `arrays`.
   * \param arrays Child arrays; each struct is copied by value into a malloc'ed `ArrowArray`
   * \return Parent array whose length is the first child's length, or 0 if there are no children
   */
  ArrowArray created_nested_array(const std::vector<ArrowArray*>& arrays) {
    auto children = static_cast<ArrowArray**>(malloc(sizeof(ArrowArray*) * arrays.size()));
    for (size_t i = 0; i < arrays.size(); ++i) {
      auto child = static_cast<ArrowArray*>(malloc(sizeof(ArrowArray)));
      *child = *arrays[i];
      children[i] = child;
    }

    ArrowArray arr;
    // Without children there is no first child to read the length from
    arr.length = arrays.empty() ? 0 : children[0]->length;
    arr.null_count = 0;
    arr.offset = 0;
    arr.n_buffers = 0;
    arr.n_children = static_cast<int64_t>(arrays.size());
    arr.buffers = nullptr;
    arr.children = children;
    arr.dictionary = nullptr;
    arr.release = &release_array;
    arr.private_data = nullptr;
    return arr;
  }
};
152
153


154
/* ------------------------------------- SCHEMA CREATION ------------------------------------- */
155

156
157
158
159
/*!
 * \brief Primary template, used for element types that have no explicit specialization.
 * \throws std::logic_error always, with the message "not implemented"
 */
template <typename T>
ArrowSchema create_primitive_schema() {
  // The exception must actually be thrown: merely constructing it would let control fall off the
  // end of a function that has to return a value
  throw std::logic_error("not implemented");
}
160

161
162
163
164
165
166
167
168
169
170
171
172
173
174
/*!
 * \brief Schema for `float` columns: format string "f", every other member zero/null.
 * \return Schema without name, metadata, children, dictionary, release callback or private data
 */
template <>
ArrowSchema create_primitive_schema<float>() {
  // Value-initialization zeroes all members, so only the format has to be set explicitly
  ArrowSchema result{};
  result.format = "f";
  return result;
}
175

176
177
178
179
180
181
182
183
184
185
186
187
188
189
/*!
 * \brief Schema for `bool` columns: format string "b", every other member zero/null.
 * \return Schema without name, metadata, children, dictionary, release callback or private data
 */
template <>
ArrowSchema create_primitive_schema<bool>() {
  // Value-initialization zeroes all members, so only the format has to be set explicitly
  ArrowSchema result{};
  result.format = "b";
  return result;
}
190

191
192
193
194
195
196
/*!
 * \brief Create a schema with format "+s" whose children are shallow copies of `arrays`.
 * \param arrays Child schemas; each struct is copied by value into a malloc'ed `ArrowSchema`
 * \return Parent schema that uses `release_schema` as its release callback
 */
ArrowSchema create_nested_schema(const std::vector<ArrowSchema*>& arrays) {
  const size_t num_fields = arrays.size();
  auto fields = static_cast<ArrowSchema**>(malloc(sizeof(ArrowSchema*) * num_fields));
  size_t pos = 0;
  for (const ArrowSchema* source : arrays) {
    auto copy = static_cast<ArrowSchema*>(malloc(sizeof(ArrowSchema)));
    *copy = *source;
    fields[pos++] = copy;
  }

  // Value-initialization zeroes all members; only the non-zero ones are assigned below
  ArrowSchema result{};
  result.format = "+s";
  result.n_children = static_cast<int64_t>(num_fields);
  result.children = fields;
  result.release = &release_schema;
  return result;
}
211

212
213
214
215
/* --------------------------------------------------------------------------------------------- */
/*                                             TESTS                                             */
/* --------------------------------------------------------------------------------------------- */

216
// The length of a chunked array is expected to be the sum of its chunk lengths, where the length
// of a chunk built with an offset is its element count minus that offset.
TEST_F(ArrowChunkedArrayTest, GetLength) {
  auto schema = create_primitive_schema<float>();

  // Single chunk
  std::vector<float> dat1 = {1, 2};
  auto arr1 = create_primitive_array(dat1);
  ArrowChunkedArray ca1(1, &arr1, &schema);
  ASSERT_EQ(ca1.get_length(), 2);

  // Two chunks
  std::vector<float> dat2 = {3, 4, 5, 6};
  auto arr2 = create_primitive_array(dat1);
  auto arr3 = create_primitive_array(dat2);
  ArrowArray arrs[2] = {arr2, arr3};
  ArrowChunkedArray ca2(2, arrs, &schema);
  ASSERT_EQ(ca2.get_length(), 6);

  // Single bit-packed boolean chunk with an offset; it is described by the boolean schema so that
  // schema and buffer layout agree
  auto bool_schema = create_primitive_schema<bool>();
  std::vector<bool> dat3 = {true, false, true, true};
  auto arr4 = create_primitive_array(dat3, 1);
  ArrowChunkedArray ca3(1, &arr4, &bool_schema);
  ASSERT_EQ(ca3.get_length(), 3);
}

// A table built from one nested chunk with two float children is expected to expose two columns
// of three rows each, with each column starting at the first value of its child.
TEST_F(ArrowChunkedArrayTest, GetColumns) {
  // Data: two float columns wrapped into one nested chunk
  const std::vector<float> first_values = {1, 2, 3};
  auto first_child = create_primitive_array(first_values);
  const std::vector<float> second_values = {4, 5, 6};
  auto second_child = create_primitive_array(second_values);
  std::vector<ArrowArray*> child_arrays = {&first_child, &second_child};
  auto chunk = created_nested_array(child_arrays);

  // Schema: one float field per column
  auto first_field = create_primitive_schema<float>();
  auto second_field = create_primitive_schema<float>();
  std::vector<ArrowSchema*> child_schemas = {&first_field, &second_field};
  auto table_schema = create_nested_schema(child_schemas);

  ArrowTable table(1, &chunk, &table_schema);
  ASSERT_EQ(table.get_num_rows(), 3);
  ASSERT_EQ(table.get_num_columns(), 2);

  auto first_column = table.get_column(0);
  ASSERT_EQ(first_column.get_length(), 3);
  ASSERT_EQ(*first_column.begin<int32_t>(), 1);

  auto second_column = table.get_column(1);
  ASSERT_EQ(second_column.get_length(), 3);
  ASSERT_EQ(*second_column.begin<int32_t>(), 4);
}

// Walks an iterator across three chunks holding the values 1..7 and checks increments, compound
// additions, decrement, subscripts and distances to the end iterator.
TEST_F(ArrowChunkedArrayTest, IteratorArithmetic) {
  const std::vector<float> head_values = {1, 2};
  auto head = create_primitive_array(head_values);
  const std::vector<float> middle_values = {3, 4, 5, 6};
  auto middle = create_primitive_array(middle_values);
  const std::vector<float> tail_values = {7};
  auto tail = create_primitive_array(tail_values);
  auto schema = create_primitive_schema<float>();

  ArrowArray chunks[3] = {head, middle, tail};
  ArrowChunkedArray chunked(3, chunks, &schema);

  // Arithmetic
  auto cursor = chunked.begin<int32_t>();
  ASSERT_EQ(*cursor, 1);
  ++cursor;
  ASSERT_EQ(*cursor, 2);
  ++cursor;
  ASSERT_EQ(*cursor, 3);
  cursor += 2;
  ASSERT_EQ(*cursor, 5);
  cursor += 2;
  ASSERT_EQ(*cursor, 7);
  --cursor;
  ASSERT_EQ(*cursor, 6);

  // Subscripts: the expected values count from the start of the chunked array even though the
  // cursor currently points at the sixth element
  ASSERT_EQ(cursor[0], 1);
  ASSERT_EQ(cursor[1], 2);
  ASSERT_EQ(cursor[2], 3);
  ASSERT_EQ(cursor[6], 7);

  // End
  auto finish = chunked.end<int32_t>();
  ASSERT_EQ(finish - cursor, 2);
  ASSERT_EQ(finish - chunked.begin<int32_t>(), 7);
}

// Iterates two bit-packed boolean chunks as floats: the first chunk has a null at position 2, the
// second chunk is built with an offset of 1. Null entries are expected to be read as NaN.
TEST_F(ArrowChunkedArrayTest, BooleanIterator) {
  const std::vector<bool> first_values = {false, true, false};
  auto first_chunk = create_primitive_array(first_values, 0, {2});
  const std::vector<bool> second_values = {false, false, false, false, true,
                                           true,  true,  true,  false, true};
  auto second_chunk = create_primitive_array(second_values, 1);
  auto schema = create_primitive_schema<bool>();

  ArrowArray chunks[2] = {first_chunk, second_chunk};
  ArrowChunkedArray chunked(2, chunks, &schema);

  // Check for values in first chunk
  auto cursor = chunked.begin<float>();
  ASSERT_EQ(*cursor, 0);
  ++cursor;
  ASSERT_EQ(*cursor, 1);
  ++cursor;
  ASSERT_TRUE(std::isnan(*cursor));

  // Check for some values in second chunk
  ++cursor;
  ASSERT_EQ(*cursor, 0);
  cursor += 3;
  ASSERT_EQ(*cursor, 1);
  cursor += 4;
  ASSERT_EQ(*cursor, 0);
  ++cursor;
  ASSERT_EQ(*cursor, 1);

  // Check end
  ++cursor;
  ASSERT_EQ(cursor, chunked.end<float>());
}

// Builds a single float chunk with an offset of 2 and nulls at positions 2 and 3, then checks that
// the first two visible entries are NaN and that subscripts yield the expected later values.
TEST_F(ArrowChunkedArrayTest, OffsetAndValidity) {
  const std::vector<float> values = {0, 1, 2, 3, 4, 5, 6};
  auto chunk = create_primitive_array(values, 2, {2, 3});
  auto schema = create_primitive_schema<float>();
  ArrowChunkedArray chunked(1, &chunk, &schema);

  auto cursor = chunked.begin<double>();
  ASSERT_TRUE(std::isnan(*cursor));
  ++cursor;
  ASSERT_TRUE(std::isnan(*cursor));
  ASSERT_EQ(cursor[2], 4);
  ASSERT_EQ(cursor[4], 6);

  // Free the buffers owned by the chunk through its release callback
  chunk.release(&chunk);
}