/*!
 * Copyright (c) 2022 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */

#include <gtest/gtest.h>
#include <testutils.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/c_api.h>
#include <LightGBM/dataset.h>

#include <cmath>
#include <cstdint>
#include <exception>
#include <iostream>
#include <string>
#include <vector>

using LightGBM::Dataset;
using LightGBM::Log;
using LightGBM::TestUtils;

void test_stream_dense(
  int8_t creation_type,
22
  DatasetHandle ref_dataset_handle,
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
  int32_t nrows,
  int32_t ncols,
  int32_t nclasses,
  int batch_count,
  const std::vector<double>* features,
  const std::vector<float>* labels,
  const std::vector<float>* weights,
  const std::vector<double>* init_scores,
  const std::vector<int32_t>* groups) {
  Log::Info("Streaming %d rows dense data with a batch size of %d", nrows, batch_count);
  DatasetHandle dataset_handle = nullptr;
  Dataset* dataset = nullptr;

  int has_weights = weights != nullptr;
  int has_init_scores = init_scores != nullptr;
  int has_queries = groups != nullptr;

40
41
42
  bool succeeded = true;
  std::string exceptionText("");

43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
  try {
    int result = 0;
    switch (creation_type) {
      case 0: {
        Log::Info("Creating Dataset using LGBM_DatasetCreateFromSampledColumn, %d rows dense data with a batch size of %d", nrows, batch_count);

        // construct sample data first (use all data for convenience and since size is small)
        std::vector<std::vector<double>> sample_values(ncols);
        std::vector<std::vector<int>> sample_idx(ncols);
        const double* current_val = features->data();
        for (int32_t idx = 0; idx < nrows; ++idx) {
          for (int32_t k = 0; k < ncols; ++k) {
            if (std::fabs(*current_val) > 1e-35f || std::isnan(*current_val)) {
              sample_values[k].emplace_back(*current_val);
              sample_idx[k].emplace_back(static_cast<int>(idx));
            }
            current_val++;
          }
        }

        std::vector<int> sample_sizes;
        std::vector<double*> sample_values_ptrs;
        std::vector<int*> sample_idx_ptrs;
        for (int32_t i = 0; i < ncols; ++i) {
          sample_values_ptrs.push_back(sample_values[i].data());
          sample_idx_ptrs.push_back(sample_idx[i].data());
          sample_sizes.push_back(static_cast<int>(sample_values[i].size()));
        }

        result = LGBM_DatasetCreateFromSampledColumn(
          sample_values_ptrs.data(),
          sample_idx_ptrs.data(),
          ncols,
          sample_sizes.data(),
          nrows,
          nrows,
          nrows,
          "max_bin=15",
          &dataset_handle);
        EXPECT_EQ(0, result) << "LGBM_DatasetCreateFromSampledColumn result code: " << result;

84
        result = LGBM_DatasetInitStreaming(dataset_handle, has_weights, has_init_scores, has_queries, nclasses, 1, -1);
85
86
87
88
89
90
        EXPECT_EQ(0, result) << "LGBM_DatasetInitStreaming result code: " << result;
        break;
      }

      case 1:
        Log::Info("Creating Dataset using LGBM_DatasetCreateByReference, %d rows dense data with a batch size of %d", nrows, batch_count);
91
        result = LGBM_DatasetCreateByReference(ref_dataset_handle, nrows, &dataset_handle);
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
        EXPECT_EQ(0, result) << "LGBM_DatasetCreateByReference result code: " << result;
        break;
    }

    dataset = static_cast<Dataset*>(dataset_handle);

    Log::Info("Streaming dense dataset, %d rows dense data with a batch size of %d", nrows, batch_count);
    TestUtils::StreamDenseDataset(
      dataset_handle,
      nrows,
      ncols,
      nclasses,
      batch_count,
      features,
      labels,
      weights,
      init_scores,
      groups);

    dataset->FinishLoad();

    TestUtils::AssertMetadata(&dataset->metadata(),
                              labels,
                              weights,
                              init_scores,
                              groups);
  }
119
120
121
  catch (std::exception& ex) {
    succeeded = false;
    exceptionText = std::string(ex.what());
122
123
124
125
126
127
  }

  if (dataset_handle) {
    int result = LGBM_DatasetFree(dataset_handle);
    EXPECT_EQ(0, result) << "LGBM_DatasetFree result code: " << result;
  }
128
129
130
131

  if (!succeeded) {
    FAIL() << "Test Dense Stream failed with exception: " << exceptionText;
  }
132
133
134
135
}

void test_stream_sparse(
  int8_t creation_type,
136
  DatasetHandle ref_dataset_handle,
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
  int32_t nrows,
  int32_t ncols,
  int32_t nclasses,
  int batch_count,
  const std::vector<int32_t>* indptr,
  const std::vector<int32_t>* indices,
  const std::vector<double>* vals,
  const std::vector<float>* labels,
  const std::vector<float>* weights,
  const std::vector<double>* init_scores,
  const std::vector<int32_t>* groups) {
  Log::Info("Streaming %d rows sparse data with a batch size of %d", nrows, batch_count);
  DatasetHandle dataset_handle = nullptr;
  Dataset* dataset = nullptr;

  int has_weights = weights != nullptr;
  int has_init_scores = init_scores != nullptr;
  int has_queries = groups != nullptr;

156
157
158
  bool succeeded = true;
  std::string exceptionText("");

159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
  try {
    int result = 0;
    switch (creation_type) {
      case 0: {
        Log::Info("Creating Dataset using LGBM_DatasetCreateFromSampledColumn, %d rows sparse data with a batch size of %d", nrows, batch_count);

        std::vector<std::vector<double>> sample_values(ncols);
        std::vector<std::vector<int>> sample_idx(ncols);
        for (size_t i = 0; i < indptr->size() - 1; ++i) {
          int start_index = indptr->at(i);
          int stop_index = indptr->at(i + 1);
          for (int32_t j = start_index; j < stop_index; ++j) {
            auto val = vals->at(j);
            auto idx = indices->at(j);
            if (std::fabs(val) > 1e-35f || std::isnan(val)) {
              sample_values[idx].emplace_back(val);
              sample_idx[idx].emplace_back(static_cast<int>(i));
            }
          }
        }

        std::vector<int> sample_sizes;
        std::vector<double*> sample_values_ptrs;
        std::vector<int*> sample_idx_ptrs;
        for (int32_t i = 0; i < ncols; ++i) {
          sample_values_ptrs.push_back(sample_values[i].data());
          sample_idx_ptrs.push_back(sample_idx[i].data());
          sample_sizes.push_back(static_cast<int>(sample_values[i].size()));
        }

        result = LGBM_DatasetCreateFromSampledColumn(
          sample_values_ptrs.data(),
          sample_idx_ptrs.data(),
          ncols,
          sample_sizes.data(),
          nrows,
          nrows,
          nrows,
          "max_bin=15",
          &dataset_handle);
        EXPECT_EQ(0, result) << "LGBM_DatasetCreateFromSampledColumn result code: " << result;

        dataset = static_cast<Dataset*>(dataset_handle);
202
        dataset->InitStreaming(nrows, has_weights, has_init_scores, has_queries, nclasses, 2, -1);
203
204
205
206
207
        break;
      }

      case 1:
        Log::Info("Creating Dataset using LGBM_DatasetCreateByReference, %d rows sparse data with a batch size of %d", nrows, batch_count);
208
        result = LGBM_DatasetCreateByReference(ref_dataset_handle, nrows, &dataset_handle);
209
210
211
212
213
214
        EXPECT_EQ(0, result) << "LGBM_DatasetCreateByReference result code: " << result;
        break;
    }

    dataset = static_cast<Dataset*>(dataset_handle);

215
    Log::Info("Streaming sparse dataset, %d rows sparse data with a batch size of %d", nrows, batch_count);
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
    TestUtils::StreamSparseDataset(
      dataset_handle,
      nrows,
      nclasses,
      batch_count,
      indptr,
      indices,
      vals,
      labels,
      weights,
      init_scores,
      groups);

    dataset->FinishLoad();

    TestUtils::AssertMetadata(&dataset->metadata(),
                              labels,
                              weights,
                              init_scores,
                              groups);
  }
237
238
239
  catch (std::exception& ex) {
    succeeded = false;
    exceptionText = std::string(ex.what());
240
241
242
243
244
245
  }

  if (dataset_handle) {
    int result = LGBM_DatasetFree(dataset_handle);
    EXPECT_EQ(0, result) << "LGBM_DatasetFree result code: " << result;
  }
246
247
248
249

  if (!succeeded) {
    FAIL() << "Test Sparse Stream failed with exception: " << exceptionText;
  }
250
251
252
253
}

/*!
 * \brief Streams randomly generated dense rows with labels, weights, init scores and groups
 *        into Datasets created both from sampled columns and by reference, over several batch sizes.
 */
TEST(Stream, PushDenseRowsWithMetadata) {
  // Load some test data
  DatasetHandle ref_dataset_handle = nullptr;
  const char* params = "max_bin=15";
  // Use the smaller ".test" data because we don't care about the actual data and it's smaller
  int result = TestUtils::LoadDatasetFromExamples("binary_classification/binary.test", params, &ref_dataset_handle);
  // Fatal assertions: everything below dereferences the reference dataset
  ASSERT_EQ(0, result) << "LoadDatasetFromExamples result code: " << result;
  ASSERT_TRUE(ref_dataset_handle != nullptr) << "LoadDatasetFromExamples did not return a dataset handle";

  Dataset* ref_dataset = static_cast<Dataset*>(ref_dataset_handle);
  auto noriginalrows = ref_dataset->num_data();
  Log::Info("Row count: %d", noriginalrows);
  Log::Info("Feature group count: %d", ref_dataset->num_features());

  // Add some fake initial_scores and groups so we can test streaming them
  int nclasses = 2;  // choose > 1 just to test multi-class handling
  std::vector<double> unused_init_scores;
  unused_init_scores.resize(noriginalrows * nclasses);
  std::vector<int32_t> unused_groups;
  unused_groups.assign(noriginalrows, 1);
  // NOTE(review): the last argument is the C API element-type code; 1 and 2 are presumably
  // the float64 and int32 codes matching the vectors above - confirm against c_api.h
  result = LGBM_DatasetSetField(ref_dataset_handle, "init_score", unused_init_scores.data(), noriginalrows * nclasses, 1);
  EXPECT_EQ(0, result) << "LGBM_DatasetSetField init_score result code: " << result;
  result = LGBM_DatasetSetField(ref_dataset_handle, "group", unused_groups.data(), noriginalrows, 2);
  EXPECT_EQ(0, result) << "LGBM_DatasetSetField group result code: " << result;

  // Now use the reference dataset schema to make some testable Datasets with N rows each
  int32_t nrows = 1000;
  int32_t ncols = ref_dataset->num_features();
  std::vector<double> features;
  std::vector<float> labels;
  std::vector<float> weights;
  std::vector<double> init_scores;
  std::vector<int32_t> groups;

  Log::Info("Creating random data");
  TestUtils::CreateRandomDenseData(nrows, ncols, nclasses, &features, &labels, &weights, &init_scores, &groups);

  const std::vector<int32_t> batch_counts = { 1, nrows / 100, nrows / 10, nrows };
  const std::vector<int8_t> creation_types = { 0, 1 };  // from sampled data or reference

  // exercise every creation strategy with every batch size
  for (const int8_t type : creation_types) {
    for (const int32_t batch_count : batch_counts) {
      test_stream_dense(type, ref_dataset_handle, nrows, ncols, nclasses, batch_count, &features, &labels, &weights, &init_scores, &groups);
    }
  }

  result = LGBM_DatasetFree(ref_dataset_handle);
  EXPECT_EQ(0, result) << "LGBM_DatasetFree result code: " << result;
}

/*!
 * \brief Streams randomly generated sparse (CSR) rows with labels, weights, init scores and groups
 *        into Datasets created both from sampled columns and by reference, over several batch sizes.
 */
TEST(Stream, PushSparseRowsWithMetadata) {
  // Load some test data
  DatasetHandle ref_dataset_handle = nullptr;
  const char* params = "max_bin=15";
  // Use the smaller ".test" data because we don't care about the actual data and it's smaller
  int result = TestUtils::LoadDatasetFromExamples("binary_classification/binary.test", params, &ref_dataset_handle);
  // Fatal assertions: everything below dereferences the reference dataset
  ASSERT_EQ(0, result) << "LoadDatasetFromExamples result code: " << result;
  ASSERT_TRUE(ref_dataset_handle != nullptr) << "LoadDatasetFromExamples did not return a dataset handle";

  Dataset* ref_dataset = static_cast<Dataset*>(ref_dataset_handle);
  auto noriginalrows = ref_dataset->num_data();
  Log::Info("Row count: %d", noriginalrows);
  Log::Info("Feature group count: %d", ref_dataset->num_features());

  // Add some fake initial_scores and groups so we can test streaming them
  int32_t nclasses = 2;
  std::vector<double> unused_init_scores;
  unused_init_scores.resize(noriginalrows * nclasses);
  std::vector<int32_t> unused_groups;
  unused_groups.assign(noriginalrows, 1);
  // NOTE(review): the last argument is the C API element-type code; 1 and 2 are presumably
  // the float64 and int32 codes matching the vectors above - confirm against c_api.h
  result = LGBM_DatasetSetField(ref_dataset_handle, "init_score", unused_init_scores.data(), noriginalrows * nclasses, 1);
  EXPECT_EQ(0, result) << "LGBM_DatasetSetField init_score result code: " << result;
  result = LGBM_DatasetSetField(ref_dataset_handle, "group", unused_groups.data(), noriginalrows, 2);
  EXPECT_EQ(0, result) << "LGBM_DatasetSetField group result code: " << result;

  // Now use the reference dataset schema to make some testable Datasets with N rows each
  int32_t nrows = 1000;
  int32_t ncols = ref_dataset->num_features();
  std::vector<int32_t> indptr;
  std::vector<int32_t> indices;
  std::vector<double> vals;
  std::vector<float> labels;
  std::vector<float> weights;
  std::vector<double> init_scores;
  std::vector<int32_t> groups;

  Log::Info("Creating random data");
  float sparse_percent = .1f;
  TestUtils::CreateRandomSparseData(nrows, ncols, nclasses, sparse_percent, &indptr, &indices, &vals, &labels, &weights, &init_scores, &groups);

  const std::vector<int32_t> batch_counts = { 1, nrows / 100, nrows / 10, nrows };
  const std::vector<int8_t> creation_types = { 0, 1 };  // from sampled data or reference

  // exercise every creation strategy with every batch size
  for (const int8_t type : creation_types) {
    for (const int32_t batch_count : batch_counts) {
      test_stream_sparse(type, ref_dataset_handle, nrows, ncols, nclasses, batch_count, &indptr, &indices, &vals, &labels, &weights, &init_scores, &groups);
    }
  }

  result = LGBM_DatasetFree(ref_dataset_handle);
  EXPECT_EQ(0, result) << "LGBM_DatasetFree result code: " << result;
}