dataset.cpp 64.3 KB
Newer Older
1
2
/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
Guolin Ke's avatar
Guolin Ke committed
3
4
 * Licensed under the MIT License. See LICENSE file in the project root for
 * license information.
5
 */
6
7
8
#include <LightGBM/dataset.h>

#include <LightGBM/feature_group.h>
9
#include <LightGBM/cuda/vector_cudahost.h>
10
11
12
13
#include <LightGBM/utils/array_args.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/utils/threading.h>

14
15
16
17
18
19
#include <chrono>
#include <cstdio>
#include <limits>
#include <sstream>
#include <unordered_map>

Guolin Ke's avatar
Guolin Ke committed
20
21
namespace LightGBM {

Guolin Ke's avatar
Guolin Ke committed
22
23
// Magic token written at the end of a LightGBM binary dataset file and
// checked on load to validate the file format.
const char* Dataset::binary_file_token =
    "______LightGBM_Binary_File_Token______\n";
Guolin Ke's avatar
Guolin Ke committed
24

Guolin Ke's avatar
Guolin Ke committed
25
// Default constructor: creates an empty dataset with zero rows, not backed
// by any file.
Dataset::Dataset() {
  data_filename_ = "noname";  // placeholder name for datasets without a source file
  num_data_ = 0;
  is_finish_load_ = false;
  has_raw_ = false;  // raw (un-binned) feature values are not kept by default
}

32
// Constructs a dataset pre-sized for `num_data` rows (must be positive).
// Metadata is initialized to the same row count.
Dataset::Dataset(data_size_t num_data) {
  CHECK_GT(num_data, 0);
  data_filename_ = "noname";  // placeholder name: not loaded from a file
  num_data_ = num_data;
  // NOTE(review): NO_SPECIFIC args presumably mean "no weight/query columns"
  // — confirm against Metadata::Init.
  metadata_.Init(num_data_, NO_SPECIFIC, NO_SPECIFIC);
  is_finish_load_ = false;
  group_bin_boundaries_.push_back(0);  // bin offset of the first feature group
  has_raw_ = false;
}

Guolin Ke's avatar
Guolin Ke committed
42
// Out-of-line destructor; members release their own resources.
Dataset::~Dataset() {}
Guolin Ke's avatar
Guolin Ke committed
43

Guolin Ke's avatar
Guolin Ke committed
44
// Builds the trivial feature grouping: one single-feature group per entry of
// `used_features`, preserving order. Used when feature bundling is disabled.
std::vector<std::vector<int>> NoGroup(const std::vector<int>& used_features) {
  std::vector<std::vector<int>> groups;
  groups.reserve(used_features.size());
  for (const int fidx : used_features) {
    // Each group holds exactly one feature index.
    groups.emplace_back(1, fidx);
  }
  return groups;
}

guanqun's avatar
guanqun committed
53
int GetConflictCount(const std::vector<bool>& mark, const int* indices,
Guolin Ke's avatar
Guolin Ke committed
54
                     int num_indices, data_size_t max_cnt) {
Guolin Ke's avatar
Guolin Ke committed
55
56
57
58
  int ret = 0;
  for (int i = 0; i < num_indices; ++i) {
    if (mark[indices[i]]) {
      ++ret;
59
60
61
    }
    if (ret > max_cnt) {
      return -1;
Guolin Ke's avatar
Guolin Ke committed
62
63
64
65
    }
  }
  return ret;
}
66

Guolin Ke's avatar
Guolin Ke committed
67
68
// Sets the conflict-mark bit for every sample index in `indices`, recording
// that those rows are now occupied by a feature of the current group.
void MarkUsed(std::vector<bool>* mark, const int* indices,
              data_size_t num_indices) {
  std::vector<bool>& bits = *mark;
  for (int pos = 0; pos < num_indices; ++pos) {
    bits[indices[pos]] = true;
  }
}

Guolin Ke's avatar
Guolin Ke committed
75
76
77
78
// Rebuilds the sampled index list of one feature so that it lists the sample
// rows whose value does NOT bin to the most frequent bin. This is needed when
// the most frequent bin differs from the default bin: the existing sparse
// index list (rows with explicit values) no longer identifies the rows that
// must be stored. Returns an empty vector when no fix-up is required.
std::vector<int> FixSampleIndices(const BinMapper* bin_mapper,
                                  int num_total_samples, int num_indices,
                                  const int* sample_indices,
                                  const double* sample_values) {
  std::vector<int> ret;
  // Fast path: default bin == most frequent bin, so the existing sparse
  // index list is already correct.
  if (bin_mapper->GetDefaultBin() == bin_mapper->GetMostFreqBin()) {
    return ret;
  }
  // Merge-walk: i scans all sample rows, j scans the sparse index list.
  int i = 0, j = 0;
  while (i < num_total_samples) {
    if (j < num_indices && sample_indices[j] < i) {
      ++j;  // advance past index-list entries behind the row cursor
    } else if (j < num_indices && sample_indices[j] == i) {
      // Row i has an explicit value: keep it only when it does not bin to
      // the most frequent bin.
      if (bin_mapper->ValueToBin(sample_values[j]) !=
          bin_mapper->GetMostFreqBin()) {
        ret.push_back(i);
      }
      ++i;
    } else {
      // Row i has no explicit value (implicit default). Since the default
      // bin is not the most frequent bin here, it must be listed explicitly.
      ret.push_back(i++);
    }
  }
  return ret;
}

Guolin Ke's avatar
Guolin Ke committed
100
101
102
103
104
105
// Greedily partitions the features in `find_order` into bundled groups so
// that features sharing a group rarely have sampled values on the same row
// (Exclusive Feature Bundling). A small conflict budget of
// total_sample_cnt / 10000 rows is tolerated per group. On GPU a group is
// additionally capped at 256 bins. When `is_sparse`, groups whose used-row
// density is below 0.4 are dissolved and re-bundled into one final group,
// which may be flagged as multi-value in `multi_val_group`.
// Features with fidx >= num_sample_col were filtered from sampling and are
// treated as having zero sampled rows.
std::vector<std::vector<int>> FindGroups(
    const std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
    const std::vector<int>& find_order, int** sample_indices,
    const int* num_per_col, int num_sample_col, data_size_t total_sample_cnt,
    data_size_t num_data, bool is_use_gpu, bool is_sparse,
    std::vector<int8_t>* multi_val_group) {
  const int max_search_group = 100;   // cap on candidate groups scanned per feature
  const int max_bin_per_group = 256;  // GPU-only cap on total bins per group
  const data_size_t single_val_max_conflict_cnt =
      static_cast<data_size_t>(total_sample_cnt / 10000);
  multi_val_group->clear();
  Random rand(num_data);
  std::vector<std::vector<int>> features_in_group;
  // conflict_marks[gid][row] == true when some feature of group gid already
  // occupies that sampled row.
  std::vector<std::vector<bool>> conflict_marks;
  std::vector<data_size_t> group_used_row_cnt;    // distinct rows used per group
  std::vector<data_size_t> group_total_data_cnt;  // sum of per-feature row counts
  std::vector<int> group_num_bin;
  // first round: fill the single val group
  for (auto fidx : find_order) {
    bool is_filtered_feature = fidx >= num_sample_col;
    const data_size_t cur_non_zero_cnt =
        is_filtered_feature ? 0 : num_per_col[fidx];
    // Collect groups that could still absorb this feature within budget.
    std::vector<int> available_groups;
    for (int gid = 0; gid < static_cast<int>(features_in_group.size()); ++gid) {
      // -1 compensates for the elided bin when the default bin is 0.
      auto cur_num_bin = group_num_bin[gid] + bin_mappers[fidx]->num_bin() +
                         (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0);
      if (group_total_data_cnt[gid] + cur_non_zero_cnt <=
          total_sample_cnt + single_val_max_conflict_cnt) {
        if (!is_use_gpu || cur_num_bin <= max_bin_per_group) {
          available_groups.push_back(gid);
        }
      }
    }
    // Randomly sample up to max_search_group candidates to bound the search.
    std::vector<int> search_groups;
    if (!available_groups.empty()) {
      int last = static_cast<int>(available_groups.size()) - 1;
      auto indices = rand.Sample(last, std::min(last, max_search_group - 1));
      // always push the last group
      search_groups.push_back(available_groups.back());
      for (auto idx : indices) {
        search_groups.push_back(available_groups[idx]);
      }
    }
    int best_gid = -1;
    int best_conflict_cnt = -1;
    // Take the first candidate whose conflict count fits the remaining budget.
    for (auto gid : search_groups) {
      const data_size_t rest_max_cnt = single_val_max_conflict_cnt -
                                       group_total_data_cnt[gid] +
                                       group_used_row_cnt[gid];
      const data_size_t cnt =
          is_filtered_feature
              ? 0
              : GetConflictCount(conflict_marks[gid], sample_indices[fidx],
                                 num_per_col[fidx], rest_max_cnt);
      // cnt < 0 means GetConflictCount aborted (over budget); also reject if
      // more than half of this feature's rows would conflict.
      if (cnt >= 0 && cnt <= rest_max_cnt && cnt <= cur_non_zero_cnt / 2) {
        best_gid = gid;
        best_conflict_cnt = cnt;
        break;
      }
    }
    if (best_gid >= 0) {
      // Add the feature to the chosen group and update its bookkeeping.
      features_in_group[best_gid].push_back(fidx);
      group_total_data_cnt[best_gid] += cur_non_zero_cnt;
      group_used_row_cnt[best_gid] += cur_non_zero_cnt - best_conflict_cnt;
      if (!is_filtered_feature) {
        MarkUsed(&conflict_marks[best_gid], sample_indices[fidx],
                 num_per_col[fidx]);
      }
      group_num_bin[best_gid] +=
          bin_mappers[fidx]->num_bin() +
          (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0);
    } else {
      // No compatible group: open a new one holding just this feature.
      features_in_group.emplace_back();
      features_in_group.back().push_back(fidx);
      conflict_marks.emplace_back(total_sample_cnt, false);
      if (!is_filtered_feature) {
        MarkUsed(&(conflict_marks.back()), sample_indices[fidx],
                 num_per_col[fidx]);
      }
      group_total_data_cnt.emplace_back(cur_non_zero_cnt);
      group_used_row_cnt.emplace_back(cur_non_zero_cnt);
      group_num_bin.push_back(
          1 + bin_mappers[fidx]->num_bin() +
          (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0));
    }
  }
  if (!is_sparse) {
    // Dense mode: keep the first-round grouping, no multi-value groups.
    multi_val_group->resize(features_in_group.size(), false);
    return features_in_group;
  }
  // Second round (sparse mode): keep dense-enough groups, gather the rest.
  std::vector<int> second_round_features;
  std::vector<std::vector<int>> features_in_group2;
  std::vector<std::vector<bool>> conflict_marks2;
  const double dense_threshold = 0.4;
  for (int gid = 0; gid < static_cast<int>(features_in_group.size()); ++gid) {
    const double dense_rate =
        static_cast<double>(group_used_row_cnt[gid]) / total_sample_cnt;
    if (dense_rate >= dense_threshold) {
      features_in_group2.push_back(std::move(features_in_group[gid]));
      conflict_marks2.push_back(std::move(conflict_marks[gid]));
    } else {
      for (auto fidx : features_in_group[gid]) {
        second_round_features.push_back(fidx);
      }
    }
  }
  features_in_group = features_in_group2;
  conflict_marks = conflict_marks2;
  multi_val_group->resize(features_in_group.size(), false);
  if (!second_round_features.empty()) {
    // Bundle all sparse leftovers into one group; it degrades to a
    // multi-value group once the conflict budget is exceeded (always on GPU).
    features_in_group.emplace_back();
    conflict_marks.emplace_back(total_sample_cnt, false);
    bool is_multi_val = is_use_gpu ? true : false;
    int conflict_cnt = 0;
    for (auto fidx : second_round_features) {
      features_in_group.back().push_back(fidx);
      if (!is_multi_val) {
        const int rest_max_cnt = single_val_max_conflict_cnt - conflict_cnt;
        const auto cnt =
            GetConflictCount(conflict_marks.back(), sample_indices[fidx],
                             num_per_col[fidx], rest_max_cnt);
        conflict_cnt += cnt;
        if (cnt < 0 || conflict_cnt > single_val_max_conflict_cnt) {
          is_multi_val = true;
          continue;
        }
        MarkUsed(&(conflict_marks.back()), sample_indices[fidx],
                 num_per_col[fidx]);
      }
    }
    multi_val_group->push_back(is_multi_val);
  }
  return features_in_group;
}

Guolin Ke's avatar
Guolin Ke committed
239
240
241
242
243
244
// Runs feature bundling (FindGroups) twice — once in the caller-given feature
// order and once with features sorted by descending sampled-row count — and
// keeps whichever produces fewer groups, then randomly shuffles group order.
// Before grouping, per-feature sampled index lists are fixed up
// (FixSampleIndices) for features whose most-frequent bin differs from the
// default bin. NOTE: `sample_indices[fidx]` may be repointed at temporary
// storage owned by this function's `tmp_indices`; the pointers are only valid
// during this call.
std::vector<std::vector<int>> FastFeatureBundling(
    const std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
    int** sample_indices, double** sample_values, const int* num_per_col,
    int num_sample_col, data_size_t total_sample_cnt,
    const std::vector<int>& used_features, data_size_t num_data,
    bool is_use_gpu, bool is_sparse, std::vector<int8_t>* multi_val_group) {
  Common::FunctionTimer fun_timer("Dataset::FastFeatureBundling", global_timer);
  std::vector<size_t> feature_non_zero_cnt;
  feature_non_zero_cnt.reserve(used_features.size());
  // put dense feature first
  for (auto fidx : used_features) {
    // Features beyond num_sample_col were not sampled: treat as empty.
    if (fidx < num_sample_col) {
      feature_non_zero_cnt.emplace_back(num_per_col[fidx]);
    } else {
      feature_non_zero_cnt.emplace_back(0);
    }
  }
  // sort by non zero cnt
  std::vector<int> sorted_idx;
  sorted_idx.reserve(used_features.size());
  for (int i = 0; i < static_cast<int>(used_features.size()); ++i) {
    sorted_idx.emplace_back(i);
  }
  // sort by non zero cnt, bigger first
  std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
                   [&feature_non_zero_cnt](int a, int b) {
                     return feature_non_zero_cnt[a] > feature_non_zero_cnt[b];
                   });
  std::vector<int> feature_order_by_cnt;
  feature_order_by_cnt.reserve(sorted_idx.size());
  for (auto sidx : sorted_idx) {
    feature_order_by_cnt.push_back(used_features[sidx]);
  }
  // Fix up sampled index lists where most-freq bin != default bin; the
  // corrected lists live in tmp_indices for the duration of this call.
  std::vector<std::vector<int>> tmp_indices;
  std::vector<int> tmp_num_per_col(num_sample_col, 0);
  for (auto fidx : used_features) {
    if (fidx >= num_sample_col) {
      continue;
    }
    auto ret = FixSampleIndices(
        bin_mappers[fidx].get(), static_cast<int>(total_sample_cnt),
        num_per_col[fidx], sample_indices[fidx], sample_values[fidx]);
    if (!ret.empty()) {
      tmp_indices.push_back(ret);
      tmp_num_per_col[fidx] = static_cast<int>(ret.size());
      sample_indices[fidx] = tmp_indices.back().data();
    } else {
      tmp_num_per_col[fidx] = num_per_col[fidx];
    }
  }
  std::vector<int8_t> group_is_multi_val, group_is_multi_val2;
  // Try both orderings and keep the grouping with fewer groups.
  auto features_in_group =
      FindGroups(bin_mappers, used_features, sample_indices,
                 tmp_num_per_col.data(), num_sample_col, total_sample_cnt,
                 num_data, is_use_gpu, is_sparse, &group_is_multi_val);
  auto group2 =
      FindGroups(bin_mappers, feature_order_by_cnt, sample_indices,
                 tmp_num_per_col.data(), num_sample_col, total_sample_cnt,
                 num_data, is_use_gpu, is_sparse, &group_is_multi_val2);
  if (features_in_group.size() > group2.size()) {
    features_in_group = group2;
    group_is_multi_val = group_is_multi_val2;
  }
  // shuffle groups
  int num_group = static_cast<int>(features_in_group.size());
  Random tmp_rand(num_data);
  for (int i = 0; i < num_group - 1; ++i) {
    int j = tmp_rand.NextShort(i + 1, num_group);
    std::swap(features_in_group[i], features_in_group[j]);
    // Using std::swap for vector<bool> will cause the wrong result.
    std::swap(group_is_multi_val[i], group_is_multi_val[j]);
  }
  *multi_val_group = group_is_multi_val;
  return features_in_group;
}

Guolin Ke's avatar
Guolin Ke committed
318
319
320
321
322
323
// Builds the dataset's internal layout from per-feature bin mappers: selects
// non-trivial features, optionally bundles them (FastFeatureBundling), then
// creates the FeatureGroups and every index mapping (feature <-> group,
// per-group bin boundaries, numeric-feature map). Takes ownership of the bin
// mappers: they are release()d out of *bin_mappers into the groups.
void Dataset::Construct(std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
                        int num_total_features,
                        const std::vector<std::vector<double>>& forced_bins,
                        int** sample_non_zero_indices, double** sample_values,
                        const int* num_per_col, int num_sample_col,
                        size_t total_sample_cnt, const Config& io_config) {
  num_total_features_ = num_total_features;
  CHECK_EQ(num_total_features_, static_cast<int>(bin_mappers->size()));
  // get num_features
  // Keep only features whose bin mapper exists and is not trivial.
  std::vector<int> used_features;
  auto& ref_bin_mappers = *bin_mappers;
  for (int i = 0; i < static_cast<int>(bin_mappers->size()); ++i) {
    if (ref_bin_mappers[i] != nullptr && !ref_bin_mappers[i]->is_trivial()) {
      used_features.emplace_back(i);
    }
  }
  if (used_features.empty()) {
    Log::Warning(
        "There are no meaningful features which satisfy the provided configuration. "
        "Decreasing Dataset parameters min_data_in_bin or min_data_in_leaf and re-constructing "
        "Dataset might resolve this warning.");
  }
  // Start from the trivial one-feature-per-group layout.
  auto features_in_group = NoGroup(used_features);
  auto is_sparse = io_config.is_enable_sparse;
  if (io_config.device_type == std::string("cuda") || io_config.device_type == std::string("cuda_exp")) {
      LGBM_config_::current_device = lgbm_device_cuda;
      // The (non-experimental) CUDA backend cannot handle sparse features.
      if (io_config.device_type == std::string("cuda") && is_sparse) {
        Log::Warning("Using sparse features with CUDA is currently not supported.");
        is_sparse = false;
      }
  }
  std::vector<int8_t> group_is_multi_val(used_features.size(), 0);
  if (io_config.enable_bundle && !used_features.empty()) {
    bool lgbm_is_gpu_used = io_config.device_type == std::string("gpu") || io_config.device_type == std::string("cuda")
      || io_config.device_type == std::string("cuda_exp");
    features_in_group = FastFeatureBundling(
        *bin_mappers, sample_non_zero_indices, sample_values, num_per_col,
        num_sample_col, static_cast<data_size_t>(total_sample_cnt),
        used_features, num_data_, lgbm_is_gpu_used,
        is_sparse, &group_is_multi_val);
  }
  // Build the flat feature <-> group index mappings.
  num_features_ = 0;
  for (const auto& fs : features_in_group) {
    num_features_ += static_cast<int>(fs.size());
  }
  int cur_fidx = 0;
  used_feature_map_ = std::vector<int>(num_total_features_, -1);  // -1: unused feature
  num_groups_ = static_cast<int>(features_in_group.size());
  real_feature_idx_.resize(num_features_);
  feature2group_.resize(num_features_);
  feature2subfeature_.resize(num_features_);
  feature_need_push_zeros_.clear();
  group_bin_boundaries_.clear();
  uint64_t num_total_bin = 0;
  group_bin_boundaries_.push_back(num_total_bin);
  group_feature_start_.resize(num_groups_);
  group_feature_cnt_.resize(num_groups_);
  for (int i = 0; i < num_groups_; ++i) {
    auto cur_features = features_in_group[i];
    int cur_cnt_features = static_cast<int>(cur_features.size());
    group_feature_start_[i] = cur_fidx;
    group_feature_cnt_[i] = cur_cnt_features;
    // get bin_mappers
    std::vector<std::unique_ptr<BinMapper>> cur_bin_mappers;
    for (int j = 0; j < cur_cnt_features; ++j) {
      int real_fidx = cur_features[j];
      used_feature_map_[real_fidx] = cur_fidx;
      real_feature_idx_[cur_fidx] = real_fidx;
      feature2group_[cur_fidx] = i;
      feature2subfeature_[cur_fidx] = j;
      // Transfer mapper ownership from the caller into this group.
      cur_bin_mappers.emplace_back(ref_bin_mappers[real_fidx].release());
      // Features whose default bin is not the most frequent bin need explicit
      // zero pushes when data is streamed in.
      if (cur_bin_mappers.back()->GetDefaultBin() !=
          cur_bin_mappers.back()->GetMostFreqBin()) {
        feature_need_push_zeros_.push_back(cur_fidx);
      }
      ++cur_fidx;
    }
    feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>(
      new FeatureGroup(cur_cnt_features, group_is_multi_val[i], &cur_bin_mappers, num_data_, i)));
    num_total_bin += feature_groups_[i]->num_total_bin_;
    group_bin_boundaries_.push_back(num_total_bin);
  }
  if (!io_config.max_bin_by_feature.empty()) {
    // Per-feature bin limits must cover all features and each allow >= 2 bins.
    CHECK_EQ(static_cast<size_t>(num_total_features_),
             io_config.max_bin_by_feature.size());
    CHECK_GT(*(std::min_element(io_config.max_bin_by_feature.begin(),
                                io_config.max_bin_by_feature.end())), 1);
    max_bin_by_feature_.resize(num_total_features_);
    max_bin_by_feature_.assign(io_config.max_bin_by_feature.begin(),
                               io_config.max_bin_by_feature.end());
  }
  forced_bin_bounds_ = forced_bins;
  // Cache the binning configuration used to build this dataset.
  max_bin_ = io_config.max_bin;
  min_data_in_bin_ = io_config.min_data_in_bin;
  bin_construct_sample_cnt_ = io_config.bin_construct_sample_cnt;
  use_missing_ = io_config.use_missing;
  zero_as_missing_ = io_config.zero_as_missing;
  // Linear trees need access to the raw feature values.
  has_raw_ = false;
  if (io_config.linear_tree) {
    has_raw_ = true;
  }
  // Map inner feature index -> dense index among numerical features only.
  numeric_feature_map_ = std::vector<int>(num_features_, -1);
  num_numeric_features_ = 0;
  for (int i = 0; i < num_features_; ++i) {
    if (FeatureBinMapper(i)->bin_type() == BinType::NumericalBin) {
      numeric_feature_map_[i] = num_numeric_features_;
      ++num_numeric_features_;
    }
  }
  device_type_ = io_config.device_type;
  gpu_device_id_ = io_config.gpu_device_id;
}

Guolin Ke's avatar
Guolin Ke committed
434
// Finalizes dataset construction after all data has been pushed: finishes
// every feature group and, for the experimental CUDA backend, mirrors column
// data and metadata onto the GPU. Safe to call more than once (idempotent).
void Dataset::FinishLoad() {
  if (is_finish_load_) {
    // Already finalized; nothing to do.
    return;
  }
  if (num_groups_ > 0) {
    for (int i = 0; i < num_groups_; ++i) {
      feature_groups_[i]->FinishLoad();
    }
  }
  #ifdef USE_CUDA_EXP
  if (device_type_ == std::string("cuda_exp")) {
    CreateCUDAColumnData();
    metadata_.CreateCUDAMetadata(gpu_device_id_);
  } else {
    // Not running on the experimental CUDA device: drop any GPU column data.
    cuda_column_data_.reset(nullptr);
  }
  #endif  // USE_CUDA_EXP
  is_finish_load_ = true;
}
Guolin Ke's avatar
Guolin Ke committed
453

454
void PushDataToMultiValBin(
Guolin Ke's avatar
Guolin Ke committed
455
    data_size_t num_data, const std::vector<uint32_t> most_freq_bins,
456
    const std::vector<uint32_t> offsets,
Guolin Ke's avatar
Guolin Ke committed
457
    std::vector<std::vector<std::unique_ptr<BinIterator>>>* iters,
458
459
460
    MultiValBin* ret) {
  Common::FunctionTimer fun_time("Dataset::PushDataToMultiValBin",
                                 global_timer);
461
  if (ret->IsSparse()) {
Guolin Ke's avatar
Guolin Ke committed
462
463
464
465
466
    Threading::For<data_size_t>(
        0, num_data, 1024, [&](int tid, data_size_t start, data_size_t end) {
          std::vector<uint32_t> cur_data;
          cur_data.reserve(most_freq_bins.size());
          for (size_t j = 0; j < most_freq_bins.size(); ++j) {
Guolin Ke's avatar
Guolin Ke committed
467
            (*iters)[tid][j]->Reset(start);
468
          }
Guolin Ke's avatar
Guolin Ke committed
469
470
471
          for (data_size_t i = start; i < end; ++i) {
            cur_data.clear();
            for (size_t j = 0; j < most_freq_bins.size(); ++j) {
472
              // for sparse multi value bin, we store the feature bin values with offset added
Guolin Ke's avatar
Guolin Ke committed
473
              auto cur_bin = (*iters)[tid][j]->Get(i);
Guolin Ke's avatar
Guolin Ke committed
474
475
476
477
478
479
480
481
482
483
              if (cur_bin == most_freq_bins[j]) {
                continue;
              }
              cur_bin += offsets[j];
              if (most_freq_bins[j] == 0) {
                cur_bin -= 1;
              }
              cur_data.push_back(cur_bin);
            }
            ret->PushOneRow(tid, i, cur_data);
484
          }
Guolin Ke's avatar
Guolin Ke committed
485
        });
486
  } else {
Guolin Ke's avatar
Guolin Ke committed
487
488
489
490
    Threading::For<data_size_t>(
        0, num_data, 1024, [&](int tid, data_size_t start, data_size_t end) {
          std::vector<uint32_t> cur_data(most_freq_bins.size(), 0);
          for (size_t j = 0; j < most_freq_bins.size(); ++j) {
Guolin Ke's avatar
Guolin Ke committed
491
            (*iters)[tid][j]->Reset(start);
Guolin Ke's avatar
Guolin Ke committed
492
493
494
          }
          for (data_size_t i = start; i < end; ++i) {
            for (size_t j = 0; j < most_freq_bins.size(); ++j) {
495
              // for dense multi value bin, the feature bin values without offsets are used
Guolin Ke's avatar
Guolin Ke committed
496
              auto cur_bin = (*iters)[tid][j]->Get(i);
Guolin Ke's avatar
Guolin Ke committed
497
              cur_data[j] = cur_bin;
498
            }
Guolin Ke's avatar
Guolin Ke committed
499
            ret->PushOneRow(tid, i, cur_data);
500
          }
Guolin Ke's avatar
Guolin Ke committed
501
        });
502
503
504
  }
}

505
// Builds a MultiValBin covering the dataset's single multi-value feature
// group, or returns nullptr when no such group exists. More than one
// multi-value group is an internal error (Log::Fatal). `offsets[j]` is the
// bin offset of sub-feature j; the caller owns the returned pointer.
MultiValBin* Dataset::GetMultiBinFromSparseFeatures(const std::vector<uint32_t>& offsets) const {
  Common::FunctionTimer fun_time("Dataset::GetMultiBinFromSparseFeatures",
                                 global_timer);
  // Locate the (unique) multi-value group.
  int multi_group_id = -1;
  for (int i = 0; i < num_groups_; ++i) {
    if (feature_groups_[i]->is_multi_val_) {
      if (multi_group_id < 0) {
        multi_group_id = i;
      } else {
        Log::Fatal("Bug. There should be only one multi-val group.");
      }
    }
  }
  if (multi_group_id < 0) {
    return nullptr;
  }
  const int num_feature = feature_groups_[multi_group_id]->num_feature_;
  int num_threads = OMP_NUM_THREADS();
  // One iterator per (thread, sub-feature) pair, since iterators are stateful.
  std::vector<std::vector<std::unique_ptr<BinIterator>>> iters(num_threads);
  std::vector<uint32_t> most_freq_bins;
  double sum_sparse_rate = 0;
  for (int i = 0; i < num_feature; ++i) {
#pragma omp parallel for schedule(static, 1)
    for (int tid = 0; tid < num_threads; ++tid) {
      iters[tid].emplace_back(
          feature_groups_[multi_group_id]->SubFeatureIterator(i));
    }
    most_freq_bins.push_back(
        feature_groups_[multi_group_id]->bin_mappers_[i]->GetMostFreqBin());
    sum_sparse_rate +=
        feature_groups_[multi_group_id]->bin_mappers_[i]->sparse_rate();
  }
  // Average sparse rate decides the concrete MultiValBin implementation.
  sum_sparse_rate /= num_feature;
  Log::Debug("Dataset::GetMultiBinFromSparseFeatures: sparse rate %f",
             sum_sparse_rate);
  std::unique_ptr<MultiValBin> ret;
  ret.reset(MultiValBin::CreateMultiValBin(num_data_, offsets.back(),
                                           num_feature, sum_sparse_rate, offsets));
  PushDataToMultiValBin(num_data_, most_freq_bins, offsets, &iters, ret.get());
  ret->FinishLoad();
  return ret.release();
}

549
// Builds a single MultiValBin spanning ALL feature groups (row-wise layout):
// a multi-value group contributes one column per sub-feature, every other
// group contributes one packed column (most-freq bin 0). `offsets[j]` is the
// bin offset of column j; `offsets.back()` is passed as the total bin count.
// The caller owns the returned pointer.
//
// Fix: the size/ncol sanity check used bare CHECK(a == b); the rest of this
// file uses CHECK_EQ, which also reports both operand values on failure.
MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector<uint32_t>& offsets) const {
  Common::FunctionTimer fun_time("Dataset::GetMultiBinFromAllFeatures",
                                 global_timer);
  int num_threads = OMP_NUM_THREADS();
  double sum_dense_ratio = 0;
  std::unique_ptr<MultiValBin> ret;
  // One iterator per (thread, column) pair, since iterators are stateful.
  std::vector<std::vector<std::unique_ptr<BinIterator>>> iters(num_threads);
  std::vector<uint32_t> most_freq_bins;
  // Count output columns and accumulate density over all sub-features.
  int ncol = 0;
  for (int gid = 0; gid < num_groups_; ++gid) {
    if (feature_groups_[gid]->is_multi_val_) {
      ncol += feature_groups_[gid]->num_feature_;
    } else {
      ++ncol;
    }
    for (int fid = 0; fid < feature_groups_[gid]->num_feature_; ++fid) {
      const auto& bin_mapper = feature_groups_[gid]->bin_mappers_[fid];
      sum_dense_ratio += 1.0f - bin_mapper->sparse_rate();
    }
  }
  sum_dense_ratio /= ncol;
  // Create the per-column iterators and record each column's most-freq bin.
  for (int gid = 0; gid < num_groups_; ++gid) {
    if (feature_groups_[gid]->is_multi_val_) {
      for (int fid = 0; fid < feature_groups_[gid]->num_feature_; ++fid) {
        const auto& bin_mapper = feature_groups_[gid]->bin_mappers_[fid];
        most_freq_bins.push_back(bin_mapper->GetMostFreqBin());
#pragma omp parallel for schedule(static, 1)
        for (int tid = 0; tid < num_threads; ++tid) {
          iters[tid].emplace_back(
              feature_groups_[gid]->SubFeatureIterator(fid));
        }
      }
    } else {
      // Packed single-value group: iterate the whole group as one column.
      most_freq_bins.push_back(0);
      for (int tid = 0; tid < num_threads; ++tid) {
        iters[tid].emplace_back(feature_groups_[gid]->FeatureGroupIterator());
      }
    }
  }
  CHECK_EQ(static_cast<int>(most_freq_bins.size()), ncol);
  Log::Debug("Dataset::GetMultiBinFromAllFeatures: sparse rate %f",
             1.0 - sum_dense_ratio);
  ret.reset(MultiValBin::CreateMultiValBin(
      num_data_, offsets.back(), static_cast<int>(most_freq_bins.size()),
      1.0 - sum_dense_ratio, offsets));
  PushDataToMultiValBin(num_data_, most_freq_bins, offsets, &iters, ret.get());
  ret->FinishLoad();
  return ret.release();
}

600
/*!
 * \brief Choose the histogram-construction strategy (column-wise vs row-wise)
 *        and build the corresponding TrainingShareStates.
 *
 * If neither strategy is forced, both are benchmarked once (init + one
 * histogram construction) and the faster one is kept; the loser's setup cost
 * is reported as overhead.
 *
 * \param gradients / hessians   buffers used for the benchmark histogram pass
 * \param is_feature_used        per-feature mask for the benchmark pass
 * \param is_constant_hessian    propagated to the returned state
 * \param force_col_wise / force_row_wise  user overrides (mutually exclusive)
 * \return newly allocated TrainingShareStates; caller takes ownership
 */
TrainingShareStates* Dataset::GetShareStates(
    score_t* gradients, score_t* hessians,
    const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
    bool force_col_wise, bool force_row_wise) const {
  Common::FunctionTimer fun_timer("Dataset::TestMultiThreadingMethod",
                                  global_timer);
  if (force_col_wise && force_row_wise) {
    Log::Fatal(
        "Cannot set both of `force_col_wise` and `force_row_wise` to `true` at "
        "the same time");
  }
  if (num_groups_ <= 0) {
    // no features: trivially column-wise, nothing to benchmark
    TrainingShareStates* share_state = new TrainingShareStates();
    share_state->is_col_wise = true;
    share_state->is_constant_hessian = is_constant_hessian;
    return share_state;
  }
  if (force_col_wise) {
    TrainingShareStates* share_state = new TrainingShareStates();
    std::vector<uint32_t> offsets;
    share_state->CalcBinOffsets(
      feature_groups_, &offsets, true);
    share_state->SetMultiValBin(GetMultiBinFromSparseFeatures(offsets),
      num_data_, feature_groups_, false, true);
    share_state->is_col_wise = true;
    share_state->is_constant_hessian = is_constant_hessian;
    return share_state;
  } else if (force_row_wise) {
    TrainingShareStates* share_state = new TrainingShareStates();
    std::vector<uint32_t> offsets;
    share_state->CalcBinOffsets(
      feature_groups_, &offsets, false);
    share_state->SetMultiValBin(GetMultiBinFromAllFeatures(offsets), num_data_,
      feature_groups_, false, false);
    share_state->is_col_wise = false;
    share_state->is_constant_hessian = is_constant_hessian;
    return share_state;
  } else {
    // Benchmark both strategies and keep the faster one.
    std::unique_ptr<TrainingShareStates> col_wise_state;
    std::unique_ptr<TrainingShareStates> row_wise_state;
    col_wise_state.reset(new TrainingShareStates());
    row_wise_state.reset(new TrainingShareStates());

    std::chrono::duration<double, std::milli> col_wise_init_time, row_wise_init_time;
    auto start_time = std::chrono::steady_clock::now();
    std::vector<uint32_t> col_wise_offsets;
    col_wise_state->CalcBinOffsets(feature_groups_, &col_wise_offsets, true);
    col_wise_state->SetMultiValBin(GetMultiBinFromSparseFeatures(col_wise_offsets), num_data_,
      feature_groups_, false, true);
    col_wise_init_time = std::chrono::steady_clock::now() - start_time;

    start_time = std::chrono::steady_clock::now();
    std::vector<uint32_t> row_wise_offsets;
    row_wise_state->CalcBinOffsets(feature_groups_, &row_wise_offsets, false);
    row_wise_state->SetMultiValBin(GetMultiBinFromAllFeatures(row_wise_offsets), num_data_,
      feature_groups_, false, false);
    row_wise_init_time = std::chrono::steady_clock::now() - start_time;

    // one histogram buffer big enough for either layout
    uint64_t max_total_bin = std::max<uint64_t>(row_wise_state->num_hist_total_bin(),
      col_wise_state->num_hist_total_bin());
    std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>>
        hist_data(max_total_bin * 2);

    // NOTE: .count() is required — passing a std::chrono::duration object
    // through the variadic logger with %f is undefined behavior.
    Log::Debug(
      "init for col-wise cost %f seconds, init for row-wise cost %f seconds",
      col_wise_init_time.count() * 1e-3, row_wise_init_time.count() * 1e-3);

    col_wise_state->is_col_wise = true;
    col_wise_state->is_constant_hessian = is_constant_hessian;
    InitTrain(is_feature_used, col_wise_state.get());
    row_wise_state->is_col_wise = false;
    row_wise_state->is_constant_hessian = is_constant_hessian;
    InitTrain(is_feature_used, row_wise_state.get());
    std::chrono::duration<double, std::milli> col_wise_time, row_wise_time;
    start_time = std::chrono::steady_clock::now();
    ConstructHistograms(is_feature_used, nullptr, num_data_, gradients,
                        hessians, gradients, hessians, col_wise_state.get(),
                        hist_data.data());
    col_wise_time = std::chrono::steady_clock::now() - start_time;
    start_time = std::chrono::steady_clock::now();
    ConstructHistograms(is_feature_used, nullptr, num_data_, gradients,
                        hessians, gradients, hessians, row_wise_state.get(),
                        hist_data.data());
    row_wise_time = std::chrono::steady_clock::now() - start_time;

    if (col_wise_time < row_wise_time) {
      // the overhead is everything spent on the discarded strategy plus the
      // benchmark pass of the kept one
      auto overhead_cost = row_wise_init_time + row_wise_time + col_wise_time;
      Log::Warning(
          "Auto-choosing col-wise multi-threading, the overhead of testing was "
          "%f seconds.\n"
          "You can set `force_col_wise=true` to remove the overhead.",
          overhead_cost.count() * 1e-3);
      return col_wise_state.release();
    } else {
      auto overhead_cost = col_wise_init_time + row_wise_time + col_wise_time;
      Log::Warning(
          "Auto-choosing row-wise multi-threading, the overhead of testing was "
          "%f seconds.\n"
          "You can set `force_row_wise=true` to remove the overhead.\n"
          "And if memory is not enough, you can set `force_col_wise=true`.",
          overhead_cost.count() * 1e-3);
      if (row_wise_state->IsSparseRowwise()) {
        Log::Debug("Using Sparse Multi-Val Bin");
      } else {
        Log::Debug("Using Dense Multi-Val Bin");
      }
      return row_wise_state.release();
    }
  }
}

713
void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) {
Guolin Ke's avatar
Guolin Ke committed
714
  feature_groups_.clear();
Guolin Ke's avatar
Guolin Ke committed
715
  num_features_ = dataset->num_features_;
Guolin Ke's avatar
Guolin Ke committed
716
  num_groups_ = dataset->num_groups_;
717
  has_raw_ = dataset->has_raw();
Guolin Ke's avatar
Guolin Ke committed
718
  // copy feature bin mapper data
Guolin Ke's avatar
Guolin Ke committed
719
  for (int i = 0; i < num_groups_; ++i) {
Guolin Ke's avatar
Guolin Ke committed
720
721
    feature_groups_.emplace_back(
        new FeatureGroup(*dataset->feature_groups_[i], num_data_));
Guolin Ke's avatar
Guolin Ke committed
722
  }
Guolin Ke's avatar
Guolin Ke committed
723
  feature_groups_.shrink_to_fit();
Guolin Ke's avatar
Guolin Ke committed
724
725
726
  used_feature_map_ = dataset->used_feature_map_;
  num_total_features_ = dataset->num_total_features_;
  feature_names_ = dataset->feature_names_;
Guolin Ke's avatar
Guolin Ke committed
727
  label_idx_ = dataset->label_idx_;
Guolin Ke's avatar
Guolin Ke committed
728
729
730
731
732
733
  real_feature_idx_ = dataset->real_feature_idx_;
  feature2group_ = dataset->feature2group_;
  feature2subfeature_ = dataset->feature2subfeature_;
  group_bin_boundaries_ = dataset->group_bin_boundaries_;
  group_feature_start_ = dataset->group_feature_start_;
  group_feature_cnt_ = dataset->group_feature_cnt_;
734
  forced_bin_bounds_ = dataset->forced_bin_bounds_;
Guolin Ke's avatar
Guolin Ke committed
735
  feature_need_push_zeros_ = dataset->feature_need_push_zeros_;
Guolin Ke's avatar
Guolin Ke committed
736
737
738
739
740
741
}

void Dataset::CreateValid(const Dataset* dataset) {
  feature_groups_.clear();
  num_features_ = dataset->num_features_;
  num_groups_ = num_features_;
742
743
744
745
746
  max_bin_ = dataset->max_bin_;
  min_data_in_bin_ = dataset->min_data_in_bin_;
  bin_construct_sample_cnt_ = dataset->bin_construct_sample_cnt_;
  use_missing_ = dataset->use_missing_;
  zero_as_missing_ = dataset->zero_as_missing_;
Guolin Ke's avatar
Guolin Ke committed
747
748
  feature2group_.clear();
  feature2subfeature_.clear();
749
750
751
752
  has_raw_ = dataset->has_raw();
  numeric_feature_map_ = dataset->numeric_feature_map_;
  num_numeric_features_ = dataset->num_numeric_features_;
  // copy feature bin mapper data
Guolin Ke's avatar
Guolin Ke committed
753
  feature_need_push_zeros_.clear();
Guolin Ke's avatar
Guolin Ke committed
754
755
756
757
758
  group_bin_boundaries_.clear();
  uint64_t num_total_bin = 0;
  group_bin_boundaries_.push_back(num_total_bin);
  group_feature_start_.resize(num_groups_);
  group_feature_cnt_.resize(num_groups_);
Guolin Ke's avatar
Guolin Ke committed
759
760
761
  for (int i = 0; i < num_features_; ++i) {
    std::vector<std::unique_ptr<BinMapper>> bin_mappers;
    bin_mappers.emplace_back(new BinMapper(*(dataset->FeatureBinMapper(i))));
Guolin Ke's avatar
Guolin Ke committed
762
763
    if (bin_mappers.back()->GetDefaultBin() !=
        bin_mappers.back()->GetMostFreqBin()) {
Guolin Ke's avatar
Guolin Ke committed
764
765
      feature_need_push_zeros_.push_back(i);
    }
766
    feature_groups_.emplace_back(new FeatureGroup(&bin_mappers, num_data_));
Guolin Ke's avatar
Guolin Ke committed
767
768
    feature2group_.push_back(i);
    feature2subfeature_.push_back(0);
Guolin Ke's avatar
Guolin Ke committed
769
770
771
772
    num_total_bin += feature_groups_[i]->num_total_bin_;
    group_bin_boundaries_.push_back(num_total_bin);
    group_feature_start_[i] = i;
    group_feature_cnt_[i] = 1;
Guolin Ke's avatar
Guolin Ke committed
773
774
775
776
777
778
779
780
  }

  feature_groups_.shrink_to_fit();
  used_feature_map_ = dataset->used_feature_map_;
  num_total_features_ = dataset->num_total_features_;
  feature_names_ = dataset->feature_names_;
  label_idx_ = dataset->label_idx_;
  real_feature_idx_ = dataset->real_feature_idx_;
781
  forced_bin_bounds_ = dataset->forced_bin_bounds_;
782
783
  device_type_ = dataset->device_type_;
  gpu_device_id_ = dataset->gpu_device_id_;
Guolin Ke's avatar
Guolin Ke committed
784
785
}

Guolin Ke's avatar
Guolin Ke committed
786
787
788
void Dataset::ReSize(data_size_t num_data) {
  // Resize every feature group's storage to the new row count; no-op when
  // the size is unchanged.
  if (num_data_ == num_data) {
    return;
  }
  num_data_ = num_data;
  OMP_INIT_EX();
#pragma omp parallel for schedule(static)
  for (int group = 0; group < num_groups_; ++group) {
    OMP_LOOP_EX_BEGIN();
    feature_groups_[group]->ReSize(num_data_);
    OMP_LOOP_EX_END();
  }
  OMP_THROW_EX();
}

800
void Dataset::CopySubrow(const Dataset* fullset,
Guolin Ke's avatar
Guolin Ke committed
801
802
                         const data_size_t* used_indices,
                         data_size_t num_used_indices, bool need_meta_data) {
Nikita Titov's avatar
Nikita Titov committed
803
  CHECK_EQ(num_used_indices, num_data_);
804
805
806
807

  std::vector<int> group_ids, subfeature_ids;
  group_ids.reserve(num_features_);
  subfeature_ids.reserve(num_features_);
Guolin Ke's avatar
Guolin Ke committed
808
  for (int group = 0; group < num_groups_; ++group) {
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
    if (fullset->IsMultiGroup(group)) {
      for (int sub_feature = 0; sub_feature <
          fullset->feature_groups_[group]->num_feature_; ++sub_feature) {
        group_ids.emplace_back(group);
        subfeature_ids.emplace_back(sub_feature);
      }
    } else {
      group_ids.emplace_back(group);
      subfeature_ids.emplace_back(-1);
    }
  }
  int num_copy_tasks = static_cast<int>(group_ids.size());

  OMP_INIT_EX();
  #pragma omp parallel for schedule(dynamic)
  for (int task_id = 0; task_id < num_copy_tasks; ++task_id) {
825
    OMP_LOOP_EX_BEGIN();
826
827
828
829
    int group = group_ids[task_id];
    int subfeature = subfeature_ids[task_id];
    feature_groups_[group]->CopySubrowByCol(fullset->feature_groups_[group].get(),
                                            used_indices, num_used_indices, subfeature);
830
    OMP_LOOP_EX_END();
Guolin Ke's avatar
Guolin Ke committed
831
  }
832
  OMP_THROW_EX();
833

Guolin Ke's avatar
Guolin Ke committed
834
  if (need_meta_data) {
Guolin Ke's avatar
Guolin Ke committed
835
    metadata_.Init(fullset->metadata_, used_indices, num_used_indices);
Guolin Ke's avatar
Guolin Ke committed
836
  }
Guolin Ke's avatar
Guolin Ke committed
837
  is_finish_load_ = true;
838
839
840
841
842
843
844
845
846
847
848
  numeric_feature_map_ = fullset->numeric_feature_map_;
  num_numeric_features_ = fullset->num_numeric_features_;
  if (has_raw_) {
    ResizeRaw(num_used_indices);
#pragma omp parallel for schedule(static)
    for (int i = 0; i < num_used_indices; ++i) {
      for (int j = 0; j < num_numeric_features_; ++j) {
        raw_data_[j][i] = fullset->raw_data_[j][used_indices[i]];
      }
    }
  }
849
850
851
852
853
854
855
856
857
858
859
860
861
  // update CUDA storage for column data and metadata
  device_type_ = fullset->device_type_;
  gpu_device_id_ = fullset->gpu_device_id_;

  #ifdef USE_CUDA_EXP
  if (device_type_ == std::string("cuda_exp")) {
    if (cuda_column_data_ == nullptr) {
      cuda_column_data_.reset(new CUDAColumnData(fullset->num_data(), gpu_device_id_));
      metadata_.CreateCUDAMetadata(gpu_device_id_);
    }
    cuda_column_data_->CopySubrow(fullset->cuda_column_data(), used_indices, num_used_indices);
  }
  #endif  // USE_CUDA_EXP
Guolin Ke's avatar
Guolin Ke committed
862
863
}

Guolin Ke's avatar
Guolin Ke committed
864
865
bool Dataset::SetFloatField(const char* field_name, const float* field_data,
                            data_size_t num_element) {
Guolin Ke's avatar
Guolin Ke committed
866
867
868
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("label") || name == std::string("target")) {
Guolin Ke's avatar
Guolin Ke committed
869
#ifdef LABEL_T_USE_DOUBLE
870
    Log::Fatal("Don't support LABEL_T_USE_DOUBLE");
Guolin Ke's avatar
Guolin Ke committed
871
#else
872
    metadata_.SetLabel(field_data, num_element);
Guolin Ke's avatar
Guolin Ke committed
873
#endif
Guolin Ke's avatar
Guolin Ke committed
874
  } else if (name == std::string("weight") || name == std::string("weights")) {
Guolin Ke's avatar
Guolin Ke committed
875
#ifdef LABEL_T_USE_DOUBLE
876
    Log::Fatal("Don't support LABEL_T_USE_DOUBLE");
Guolin Ke's avatar
Guolin Ke committed
877
#else
878
    metadata_.SetWeights(field_data, num_element);
Guolin Ke's avatar
Guolin Ke committed
879
#endif
Guolin Ke's avatar
Guolin Ke committed
880
881
882
883
884
885
  } else {
    return false;
  }
  return true;
}

Guolin Ke's avatar
Guolin Ke committed
886
887
bool Dataset::SetDoubleField(const char* field_name, const double* field_data,
                             data_size_t num_element) {
Guolin Ke's avatar
Guolin Ke committed
888
889
890
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("init_score")) {
891
    metadata_.SetInitScore(field_data, num_element);
Guolin Ke's avatar
Guolin Ke committed
892
  } else {
893
    return false;
Guolin Ke's avatar
Guolin Ke committed
894
  }
895
  return true;
Guolin Ke's avatar
Guolin Ke committed
896
897
}

Guolin Ke's avatar
Guolin Ke committed
898
899
bool Dataset::SetIntField(const char* field_name, const int* field_data,
                          data_size_t num_element) {
900
901
902
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("query") || name == std::string("group")) {
Guolin Ke's avatar
Guolin Ke committed
903
    metadata_.SetQuery(field_data, num_element);
904
905
906
907
908
909
  } else {
    return false;
  }
  return true;
}

Guolin Ke's avatar
Guolin Ke committed
910
911
bool Dataset::GetFloatField(const char* field_name, data_size_t* out_len,
                            const float** out_ptr) {
912
913
914
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("label") || name == std::string("target")) {
Guolin Ke's avatar
Guolin Ke committed
915
#ifdef LABEL_T_USE_DOUBLE
916
    Log::Fatal("Don't support LABEL_T_USE_DOUBLE");
Guolin Ke's avatar
Guolin Ke committed
917
#else
918
919
    *out_ptr = metadata_.label();
    *out_len = num_data_;
Guolin Ke's avatar
Guolin Ke committed
920
#endif
921
  } else if (name == std::string("weight") || name == std::string("weights")) {
Guolin Ke's avatar
Guolin Ke committed
922
#ifdef LABEL_T_USE_DOUBLE
923
    Log::Fatal("Don't support LABEL_T_USE_DOUBLE");
Guolin Ke's avatar
Guolin Ke committed
924
#else
925
926
    *out_ptr = metadata_.weights();
    *out_len = num_data_;
Guolin Ke's avatar
Guolin Ke committed
927
#endif
Guolin Ke's avatar
Guolin Ke committed
928
929
930
931
932
933
  } else {
    return false;
  }
  return true;
}

Guolin Ke's avatar
Guolin Ke committed
934
935
bool Dataset::GetDoubleField(const char* field_name, data_size_t* out_len,
                             const double** out_ptr) {
Guolin Ke's avatar
Guolin Ke committed
936
937
938
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("init_score")) {
939
    *out_ptr = metadata_.init_score();
Guolin Ke's avatar
Guolin Ke committed
940
    *out_len = static_cast<data_size_t>(metadata_.num_init_score());
941
  } else {
942
943
    return false;
  }
944
  return true;
945
946
}

Guolin Ke's avatar
Guolin Ke committed
947
948
bool Dataset::GetIntField(const char* field_name, data_size_t* out_len,
                          const int** out_ptr) {
  // "query"/"group" map to the query boundary array (num_queries + 1 entries).
  const std::string name = Common::Trim(std::string(field_name));
  if (name != "query" && name != "group") {
    return false;
  }
  *out_ptr = metadata_.query_boundaries();
  *out_len = metadata_.num_queries() + 1;
  return true;
}

Guolin Ke's avatar
Guolin Ke committed
960
void Dataset::SaveBinaryFile(const char* bin_filename) {
Guolin Ke's avatar
Guolin Ke committed
961
  if (bin_filename != nullptr && std::string(bin_filename) == data_filename_) {
962
    Log::Warning("Binary file %s already exists", bin_filename);
Guolin Ke's avatar
Guolin Ke committed
963
964
    return;
  }
Guolin Ke's avatar
Guolin Ke committed
965
  // if not pass a filename, just append ".bin" of original file
Guolin Ke's avatar
Guolin Ke committed
966
  std::string bin_filename_str(data_filename_);
Guolin Ke's avatar
Guolin Ke committed
967
968
969
970
  if (bin_filename == nullptr || bin_filename[0] == '\0') {
    bin_filename_str.append(".bin");
    bin_filename = bin_filename_str.c_str();
  }
Guolin Ke's avatar
Guolin Ke committed
971
  bool is_file_existed = false;
972
973

  if (VirtualFileWriter::Exists(bin_filename)) {
Guolin Ke's avatar
Guolin Ke committed
974
    is_file_existed = true;
975
    Log::Warning("File %s exists, cannot save binary to it", bin_filename);
Guolin Ke's avatar
Guolin Ke committed
976
  }
Guolin Ke's avatar
Guolin Ke committed
977

Guolin Ke's avatar
Guolin Ke committed
978
  if (!is_file_existed) {
979
980
    auto writer = VirtualFileWriter::Make(bin_filename);
    if (!writer->Init()) {
Guolin Ke's avatar
Guolin Ke committed
981
      Log::Fatal("Cannot write binary data to %s ", bin_filename);
Guolin Ke's avatar
Guolin Ke committed
982
    }
983
    Log::Info("Saving data to binary file %s", bin_filename);
984
    size_t size_of_token = std::strlen(binary_file_token);
985
    writer->AlignedWrite(binary_file_token, size_of_token);
Guolin Ke's avatar
Guolin Ke committed
986
    // get size of header
987
988
989
990
991
992
993
994
995
996
997
998
    size_t size_of_header =
        VirtualFileWriter::AlignedSize(sizeof(num_data_)) +
        VirtualFileWriter::AlignedSize(sizeof(num_features_)) +
        VirtualFileWriter::AlignedSize(sizeof(num_total_features_)) +
        VirtualFileWriter::AlignedSize(sizeof(int) * num_total_features_) +
        VirtualFileWriter::AlignedSize(sizeof(label_idx_)) +
        VirtualFileWriter::AlignedSize(sizeof(num_groups_)) +
        3 * VirtualFileWriter::AlignedSize(sizeof(int) * num_features_) +
        sizeof(uint64_t) * (num_groups_ + 1) +
        2 * VirtualFileWriter::AlignedSize(sizeof(int) * num_groups_) +
        VirtualFileWriter::AlignedSize(sizeof(int32_t) * num_total_features_) +
        VirtualFileWriter::AlignedSize(sizeof(int)) * 3 +
999
        VirtualFileWriter::AlignedSize(sizeof(bool)) * 3;
1000
1001
    // size of feature names
    for (int i = 0; i < num_total_features_; ++i) {
1002
1003
1004
      size_of_header +=
          VirtualFileWriter::AlignedSize(feature_names_[i].size()) +
          VirtualFileWriter::AlignedSize(sizeof(int));
1005
    }
1006
1007
    // size of forced bins
    for (int i = 0; i < num_total_features_; ++i) {
1008
1009
      size_of_header += forced_bin_bounds_[i].size() * sizeof(double) +
                        VirtualFileWriter::AlignedSize(sizeof(int));
1010
    }
1011
    writer->Write(&size_of_header, sizeof(size_of_header));
Guolin Ke's avatar
Guolin Ke committed
1012
    // write header
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
    writer->AlignedWrite(&num_data_, sizeof(num_data_));
    writer->AlignedWrite(&num_features_, sizeof(num_features_));
    writer->AlignedWrite(&num_total_features_, sizeof(num_total_features_));
    writer->AlignedWrite(&label_idx_, sizeof(label_idx_));
    writer->AlignedWrite(&max_bin_, sizeof(max_bin_));
    writer->AlignedWrite(&bin_construct_sample_cnt_,
                         sizeof(bin_construct_sample_cnt_));
    writer->AlignedWrite(&min_data_in_bin_, sizeof(min_data_in_bin_));
    writer->AlignedWrite(&use_missing_, sizeof(use_missing_));
    writer->AlignedWrite(&zero_as_missing_, sizeof(zero_as_missing_));
1023
    writer->AlignedWrite(&has_raw_, sizeof(has_raw_));
1024
1025
1026
1027
1028
1029
1030
    writer->AlignedWrite(used_feature_map_.data(),
                         sizeof(int) * num_total_features_);
    writer->AlignedWrite(&num_groups_, sizeof(num_groups_));
    writer->AlignedWrite(real_feature_idx_.data(), sizeof(int) * num_features_);
    writer->AlignedWrite(feature2group_.data(), sizeof(int) * num_features_);
    writer->AlignedWrite(feature2subfeature_.data(),
                         sizeof(int) * num_features_);
Guolin Ke's avatar
Guolin Ke committed
1031
1032
    writer->Write(group_bin_boundaries_.data(),
                  sizeof(uint64_t) * (num_groups_ + 1));
1033
1034
1035
    writer->AlignedWrite(group_feature_start_.data(),
                         sizeof(int) * num_groups_);
    writer->AlignedWrite(group_feature_cnt_.data(), sizeof(int) * num_groups_);
Belinda Trotta's avatar
Belinda Trotta committed
1036
1037
1038
    if (max_bin_by_feature_.empty()) {
      ArrayArgs<int32_t>::Assign(&max_bin_by_feature_, -1, num_total_features_);
    }
1039
    writer->AlignedWrite(max_bin_by_feature_.data(),
Guolin Ke's avatar
Guolin Ke committed
1040
                  sizeof(int32_t) * num_total_features_);
Belinda Trotta's avatar
Belinda Trotta committed
1041
1042
1043
    if (ArrayArgs<int32_t>::CheckAll(max_bin_by_feature_, -1)) {
      max_bin_by_feature_.clear();
    }
1044
1045
1046
    // write feature names
    for (int i = 0; i < num_total_features_; ++i) {
      int str_len = static_cast<int>(feature_names_[i].size());
1047
      writer->AlignedWrite(&str_len, sizeof(int));
1048
      const char* c_str = feature_names_[i].c_str();
1049
      writer->AlignedWrite(c_str, sizeof(char) * str_len);
1050
    }
1051
1052
1053
    // write forced bins
    for (int i = 0; i < num_total_features_; ++i) {
      int num_bounds = static_cast<int>(forced_bin_bounds_[i].size());
1054
      writer->AlignedWrite(&num_bounds, sizeof(int));
1055

1056
1057
1058
1059
      for (size_t j = 0; j < forced_bin_bounds_[i].size(); ++j) {
        writer->Write(&forced_bin_bounds_[i][j], sizeof(double));
      }
    }
1060

Guolin Ke's avatar
Guolin Ke committed
1061
1062
    // get size of meta data
    size_t size_of_metadata = metadata_.SizesInByte();
1063
    writer->Write(&size_of_metadata, sizeof(size_of_metadata));
Guolin Ke's avatar
Guolin Ke committed
1064
    // write meta data
1065
    metadata_.SaveBinaryToFile(writer.get());
Guolin Ke's avatar
Guolin Ke committed
1066
1067

    // write feature data
Guolin Ke's avatar
Guolin Ke committed
1068
    for (int i = 0; i < num_groups_; ++i) {
Guolin Ke's avatar
Guolin Ke committed
1069
      // get size of feature
Guolin Ke's avatar
Guolin Ke committed
1070
      size_t size_of_feature = feature_groups_[i]->SizesInByte();
1071
      writer->Write(&size_of_feature, sizeof(size_of_feature));
Guolin Ke's avatar
Guolin Ke committed
1072
      // write feature
1073
      feature_groups_[i]->SaveBinaryToFile(writer.get());
Guolin Ke's avatar
Guolin Ke committed
1074
    }
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086

    // write raw data; use row-major order so we can read row-by-row
    if (has_raw_) {
      for (int i = 0; i < num_data_; ++i) {
        for (int j = 0; j < num_features_; ++j) {
          int feat_ind = numeric_feature_map_[j];
          if (feat_ind > -1) {
            writer->Write(&raw_data_[feat_ind][i], sizeof(float));
          }
        }
      }
    }
Guolin Ke's avatar
Guolin Ke committed
1087
1088
1089
  }
}

1090
void Dataset::DumpTextFile(const char* text_filename) {
Guolin Ke's avatar
Guolin Ke committed
1091
1092
1093
1094
1095
1096
  FILE* file = NULL;
#if _MSC_VER
  fopen_s(&file, text_filename, "wt");
#else
  file = fopen(text_filename, "wt");
#endif
1097
1098
1099
1100
1101
  fprintf(file, "num_features: %d\n", num_features_);
  fprintf(file, "num_total_features: %d\n", num_total_features_);
  fprintf(file, "num_groups: %d\n", num_groups_);
  fprintf(file, "num_data: %d\n", num_data_);
  fprintf(file, "feature_names: ");
1102
  for (auto n : feature_names_) {
1103
1104
    fprintf(file, "%s, ", n.c_str());
  }
Belinda Trotta's avatar
Belinda Trotta committed
1105
1106
1107
1108
  fprintf(file, "\nmax_bin_by_feature: ");
  for (auto i : max_bin_by_feature_) {
    fprintf(file, "%d, ", i);
  }
1109
  fprintf(file, "\n");
1110
  for (auto n : feature_names_) {
1111
1112
    fprintf(file, "%s, ", n.c_str());
  }
1113
1114
1115
1116
1117
1118
1119
  fprintf(file, "\nforced_bins: ");
  for (int i = 0; i < num_total_features_; ++i) {
    fprintf(file, "\nfeature %d: ", i);
    for (size_t j = 0; j < forced_bin_bounds_[i].size(); ++j) {
      fprintf(file, "%lf, ", forced_bin_bounds_[i][j]);
    }
  }
1120
1121
  std::vector<std::unique_ptr<BinIterator>> iterators;
  iterators.reserve(num_features_);
1122
  for (int j = 0; j < num_features_; ++j) {
1123
1124
    auto group_idx = feature2group_[j];
    auto sub_idx = feature2subfeature_[j];
Guolin Ke's avatar
Guolin Ke committed
1125
1126
    iterators.emplace_back(
        feature_groups_[group_idx]->SubFeatureIterator(sub_idx));
1127
  }
1128
  for (data_size_t i = 0; i < num_data_; ++i) {
1129
    fprintf(file, "\n");
1130
    for (int j = 0; j < num_total_features_; ++j) {
1131
      auto inner_feature_idx = used_feature_map_[j];
1132
1133
      if (inner_feature_idx < 0) {
        fprintf(file, "NA, ");
1134
      } else {
Guolin Ke's avatar
Guolin Ke committed
1135
        fprintf(file, "%d, ", iterators[inner_feature_idx]->Get(i));
1136
1137
1138
1139
1140
1141
      }
    }
  }
  fclose(file);
}

1142
void Dataset::InitTrain(const std::vector<int8_t>& is_feature_used,
1143
                        TrainingShareStates* share_state) const {
1144
  Common::FunctionTimer fun_time("Dataset::InitTrain", global_timer);
1145
1146
1147
  share_state->InitTrain(group_feature_start_,
        feature_groups_,
        is_feature_used);
1148
1149
}

Guolin Ke's avatar
Guolin Ke committed
1150
template <bool USE_INDICES, bool ORDERED>
1151
1152
1153
1154
1155
1156
void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices,
                                          data_size_t num_data,
                                          const score_t* gradients,
                                          const score_t* hessians,
                                          TrainingShareStates* share_state,
                                          hist_t* hist_data) const {
1157
1158
  Common::FunctionTimer fun_time("Dataset::ConstructHistogramsMultiVal",
                                 global_timer);
1159
1160
  share_state->ConstructHistograms<USE_INDICES, ORDERED>(
      data_indices, num_data, gradients, hessians, hist_data);
1161
1162
}

Guolin Ke's avatar
Guolin Ke committed
1163
1164
template <bool USE_INDICES, bool USE_HESSIAN>
void Dataset::ConstructHistogramsInner(
1165
1166
1167
    const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
    data_size_t num_data, const score_t* gradients, const score_t* hessians,
    score_t* ordered_gradients, score_t* ordered_hessians,
1168
    TrainingShareStates* share_state, hist_t* hist_data) const {
1169
  if (!share_state->is_col_wise) {
Guolin Ke's avatar
Guolin Ke committed
1170
1171
    return ConstructHistogramsMultiVal<USE_INDICES, false>(
        data_indices, num_data, gradients, hessians, share_state, hist_data);
1172
1173
1174
1175
  }
  std::vector<int> used_dense_group;
  int multi_val_groud_id = -1;
  used_dense_group.reserve(num_groups_);
Guolin Ke's avatar
Guolin Ke committed
1176
  for (int group = 0; group < num_groups_; ++group) {
Guolin Ke's avatar
Guolin Ke committed
1177
    const int f_start = group_feature_start_[group];
Guolin Ke's avatar
Guolin Ke committed
1178
    const int f_cnt = group_feature_cnt_[group];
1179
    bool is_group_used = false;
Guolin Ke's avatar
Guolin Ke committed
1180
    for (int j = 0; j < f_cnt; ++j) {
Guolin Ke's avatar
Guolin Ke committed
1181
      const int fidx = f_start + j;
Guolin Ke's avatar
Guolin Ke committed
1182
      if (is_feature_used[fidx]) {
1183
        is_group_used = true;
Guolin Ke's avatar
Guolin Ke committed
1184
1185
1186
        break;
      }
    }
1187
    if (is_group_used) {
1188
1189
1190
1191
      if (feature_groups_[group]->is_multi_val_) {
        multi_val_groud_id = group;
      } else {
        used_dense_group.push_back(group);
1192
      }
Guolin Ke's avatar
Guolin Ke committed
1193
    }
1194
1195
1196
  }
  int num_used_dense_group = static_cast<int>(used_dense_group.size());
  global_timer.Start("Dataset::dense_bin_histogram");
Guolin Ke's avatar
Guolin Ke committed
1197
1198
  auto ptr_ordered_grad = gradients;
  auto ptr_ordered_hess = hessians;
1199
  if (num_used_dense_group > 0) {
Guolin Ke's avatar
Guolin Ke committed
1200
1201
    if (USE_INDICES) {
      if (USE_HESSIAN) {
Guolin Ke's avatar
Guolin Ke committed
1202
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024)
1203
1204
1205
1206
        for (data_size_t i = 0; i < num_data; ++i) {
          ordered_gradients[i] = gradients[data_indices[i]];
          ordered_hessians[i] = hessians[data_indices[i]];
        }
Guolin Ke's avatar
Guolin Ke committed
1207
1208
        ptr_ordered_grad = ordered_gradients;
        ptr_ordered_hess = ordered_hessians;
1209
      } else {
Guolin Ke's avatar
Guolin Ke committed
1210
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024)
1211
1212
        for (data_size_t i = 0; i < num_data; ++i) {
          ordered_gradients[i] = gradients[data_indices[i]];
1213
        }
Guolin Ke's avatar
Guolin Ke committed
1214
        ptr_ordered_grad = ordered_gradients;
1215
      }
Guolin Ke's avatar
Guolin Ke committed
1216
1217
    }
    OMP_INIT_EX();
1218
#pragma omp parallel for schedule(static) num_threads(share_state->num_threads)
Guolin Ke's avatar
Guolin Ke committed
1219
1220
1221
1222
1223
1224
1225
1226
1227
    for (int gi = 0; gi < num_used_dense_group; ++gi) {
      OMP_LOOP_EX_BEGIN();
      int group = used_dense_group[gi];
      auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
      const int num_bin = feature_groups_[group]->num_total_bin_;
      std::memset(reinterpret_cast<void*>(data_ptr), 0,
                  num_bin * kHistEntrySize);
      if (USE_HESSIAN) {
        if (USE_INDICES) {
1228
          feature_groups_[group]->bin_data_->ConstructHistogram(
1229
1230
              data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess,
              data_ptr);
Guolin Ke's avatar
Guolin Ke committed
1231
        } else {
1232
          feature_groups_[group]->bin_data_->ConstructHistogram(
1233
              0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr);
1234
        }
1235
      } else {
Guolin Ke's avatar
Guolin Ke committed
1236
1237
1238
1239
        if (USE_INDICES) {
          feature_groups_[group]->bin_data_->ConstructHistogram(
              data_indices, 0, num_data, ptr_ordered_grad, data_ptr);
        } else {
1240
1241
          feature_groups_[group]->bin_data_->ConstructHistogram(
              0, num_data, ptr_ordered_grad, data_ptr);
1242
        }
Guolin Ke's avatar
Guolin Ke committed
1243
1244
1245
1246
        auto cnt_dst = reinterpret_cast<hist_cnt_t*>(data_ptr + 1);
        for (int i = 0; i < num_bin * 2; i += 2) {
          data_ptr[i + 1] = static_cast<double>(cnt_dst[i]) * hessians[0];
        }
1247
      }
Guolin Ke's avatar
Guolin Ke committed
1248
      OMP_LOOP_EX_END();
Guolin Ke's avatar
Guolin Ke committed
1249
    }
Guolin Ke's avatar
Guolin Ke committed
1250
    OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
1251
  }
1252
1253
  global_timer.Stop("Dataset::dense_bin_histogram");
  if (multi_val_groud_id >= 0) {
Guolin Ke's avatar
Guolin Ke committed
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
    if (num_used_dense_group > 0) {
      ConstructHistogramsMultiVal<USE_INDICES, true>(
          data_indices, num_data, ptr_ordered_grad, ptr_ordered_hess,
          share_state,
          hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
    } else {
      ConstructHistogramsMultiVal<USE_INDICES, false>(
          data_indices, num_data, gradients, hessians, share_state,
          hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
    }
1264
  }
Guolin Ke's avatar
Guolin Ke committed
1265
1266
}

James Lamb's avatar
James Lamb committed
1267
// Explicitly instantiate the template methods so they can be called from other modules.
Guolin Ke's avatar
Guolin Ke committed
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
// Instantiate ConstructHistogramsInner for every combination of its two
// boolean template flags <USE_INDICES, USE_HESSIAN>, so translation units
// that only see the declaration can still link against these definitions.
// <USE_INDICES=true, USE_HESSIAN=true>
template void Dataset::ConstructHistogramsInner<true, true>(
    const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
    data_size_t num_data, const score_t* gradients, const score_t* hessians,
    score_t* ordered_gradients, score_t* ordered_hessians,
    TrainingShareStates* share_state, hist_t* hist_data) const;

// <USE_INDICES=true, USE_HESSIAN=false>
template void Dataset::ConstructHistogramsInner<true, false>(
    const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
    data_size_t num_data, const score_t* gradients, const score_t* hessians,
    score_t* ordered_gradients, score_t* ordered_hessians,
    TrainingShareStates* share_state, hist_t* hist_data) const;

// <USE_INDICES=false, USE_HESSIAN=true>
template void Dataset::ConstructHistogramsInner<false, true>(
    const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
    data_size_t num_data, const score_t* gradients, const score_t* hessians,
    score_t* ordered_gradients, score_t* ordered_hessians,
    TrainingShareStates* share_state, hist_t* hist_data) const;

// <USE_INDICES=false, USE_HESSIAN=false>
template void Dataset::ConstructHistogramsInner<false, false>(
    const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
    data_size_t num_data, const score_t* gradients, const score_t* hessians,
    score_t* ordered_gradients, score_t* ordered_hessians,
    TrainingShareStates* share_state, hist_t* hist_data) const;

Guolin Ke's avatar
Guolin Ke committed
1292
1293
void Dataset::FixHistogram(int feature_idx, double sum_gradient,
                           double sum_hessian, hist_t* data) const {
Guolin Ke's avatar
Guolin Ke committed
1294
1295
  const int group = feature2group_[feature_idx];
  const int sub_feature = feature2subfeature_[feature_idx];
Guolin Ke's avatar
Guolin Ke committed
1296
1297
  const BinMapper* bin_mapper =
      feature_groups_[group]->bin_mappers_[sub_feature].get();
Guolin Ke's avatar
Guolin Ke committed
1298
1299
  const int most_freq_bin = bin_mapper->GetMostFreqBin();
  if (most_freq_bin > 0) {
Guolin Ke's avatar
Guolin Ke committed
1300
    const int num_bin = bin_mapper->num_bin();
1301
1302
    GET_GRAD(data, most_freq_bin) = sum_gradient;
    GET_HESS(data, most_freq_bin) = sum_hessian;
Guolin Ke's avatar
Guolin Ke committed
1303
    for (int i = 0; i < num_bin; ++i) {
Guolin Ke's avatar
Guolin Ke committed
1304
      if (i != most_freq_bin) {
1305
1306
        GET_GRAD(data, most_freq_bin) -= GET_GRAD(data, i);
        GET_HESS(data, most_freq_bin) -= GET_HESS(data, i);
Guolin Ke's avatar
Guolin Ke committed
1307
1308
1309
1310
1311
      }
    }
  }
}

Guolin Ke's avatar
Guolin Ke committed
1312
// Appends all elements of `src` to the end of `*dest`.
// Uses vector::insert so the append is one capacity growth plus a bulk
// copy instead of an element-by-element push_back loop.
template <typename T>
void PushVector(std::vector<T>* dest, const std::vector<T>& src) {
  dest->insert(dest->end(), src.begin(), src.end());
}

Guolin Ke's avatar
Guolin Ke committed
1320
1321
1322
template <typename T>
void PushOffset(std::vector<T>* dest, const std::vector<T>& src,
                const T& offset) {
Guolin Ke's avatar
Guolin Ke committed
1323
  dest->reserve(dest->size() + src.size());
1324
  for (auto i : src) {
Guolin Ke's avatar
Guolin Ke committed
1325
    dest->push_back(i + offset);
1326
1327
1328
  }
}

Guolin Ke's avatar
Guolin Ke committed
1329
1330
1331
1332
// Appends `src` to `*dest`, padding with `deflt` when exactly one side is
// empty so the merged vector stays aligned with the merged feature set:
//  - both non-empty:             append src as-is.
//  - dest non-empty, src empty:  append `src_len` copies of `deflt`.
//  - dest empty, src non-empty:  push `dest_len` copies of `deflt` first,
//                                then append src.
//  - both empty:                 leave dest empty.
// Uses vector::insert fill/range overloads instead of manual push_back
// loops (single growth + bulk copy per branch).
template <typename T>
void PushClearIfEmpty(std::vector<T>* dest, const size_t dest_len,
                      const std::vector<T>& src, const size_t src_len,
                      const T& deflt) {
  if (!dest->empty() && !src.empty()) {
    dest->insert(dest->end(), src.begin(), src.end());
  } else if (!dest->empty() && src.empty()) {
    dest->insert(dest->end(), src_len, deflt);
  } else if (dest->empty() && !src.empty()) {
    dest->insert(dest->end(), dest_len, deflt);
    dest->insert(dest->end(), src.begin(), src.end());
  }
}

1347
// Appends all features of `other` to this Dataset (a column-wise merge).
// Both datasets must have the same number of rows and must agree on
// whether raw data is stored. If both datasets contain a multi-value
// feature group, the two multi-value groups are merged into one; otherwise
// the groups of `other` are simply appended. Duplicate feature names are
// renamed with a "D<n>_" prefix.
// Fix: the numeric-feature remapping below must read
// other->numeric_feature_map_ (the dataset being appended), not our own
// map; reading this->numeric_feature_map_[i] produced a wrong mapping.
void Dataset::AddFeaturesFrom(Dataset* other) {
  if (other->num_data_ != num_data_) {
    Log::Fatal(
        "Cannot add features from other Dataset with a different number of "
        "rows");
  }
  if (other->has_raw_ != has_raw_) {
    Log::Fatal("Can only add features from other Dataset if both or neither have raw data.");
  }
  // Locate the multi-value group (if any) in each dataset.
  int mv_gid = -1;
  int other_mv_gid = -1;
  for (int i = 0; i < num_groups_; ++i) {
    if (IsMultiGroup(i)) {
      mv_gid = i;
    }
  }
  for (int i = 0; i < other->num_groups_; ++i) {
    if (other->IsMultiGroup(i)) {
      other_mv_gid = i;
    }
  }
  // At most one of the two datasets has a multi-value group: the group
  // lists can simply be concatenated with index offsets.
  if (mv_gid < 0 || other_mv_gid < 0) {
    PushVector(&feature2subfeature_, other->feature2subfeature_);
    PushVector(&group_feature_cnt_, other->group_feature_cnt_);
    feature_groups_.reserve(other->feature_groups_.size());
    for (auto& fg : other->feature_groups_) {
      const int cur_group_id = static_cast<int>(feature_groups_.size());
      feature_groups_.emplace_back(new FeatureGroup(*fg, true, cur_group_id));
    }
    for (auto feature_idx : other->used_feature_map_) {
      if (feature_idx >= 0) {
        used_feature_map_.push_back(feature_idx + num_features_);
      } else {
        used_feature_map_.push_back(-1);  // Unused feature.
      }
    }
    PushOffset(&real_feature_idx_, other->real_feature_idx_,
               num_total_features_);
    PushOffset(&feature2group_, other->feature2group_, num_groups_);
    auto bin_offset = group_bin_boundaries_.back();
    // Skip the leading 0 when copying group_bin_boundaries.
    for (auto i = other->group_bin_boundaries_.begin() + 1;
         i < other->group_bin_boundaries_.end(); ++i) {
      group_bin_boundaries_.push_back(*i + bin_offset);
    }
    PushOffset(&group_feature_start_, other->group_feature_start_,
               num_features_);
    num_groups_ += other->num_groups_;
    num_features_ += other->num_features_;
  } else {
    // Both datasets have a multi-value group: merge the two multi-value
    // groups and rebuild all index bookkeeping from scratch.
    // First record the real feature indices of every existing group.
    std::vector<std::vector<int>> features_in_group;
    for (int i = 0; i < num_groups_; ++i) {
      int f_start = group_feature_start_[i];
      int f_cnt = group_feature_cnt_[i];
      features_in_group.emplace_back();
      for (int j = 0; j < f_cnt; ++j) {
        const int real_fidx = real_feature_idx_[f_start + j];
        features_in_group.back().push_back(real_fidx);
      }
    }
    feature_groups_[mv_gid]->AddFeaturesFrom(
        other->feature_groups_[other_mv_gid].get(), mv_gid);
    for (int i = 0; i < other->num_groups_; ++i) {
      int f_start = other->group_feature_start_[i];
      int f_cnt = other->group_feature_cnt_[i];
      if (i == other_mv_gid) {
        // Features of other's multi-value group join our multi-value group.
        for (int j = 0; j < f_cnt; ++j) {
          const int real_fidx = other->real_feature_idx_[f_start + j] + num_total_features_;
          features_in_group[mv_gid].push_back(real_fidx);
        }
      } else {
        // Dense groups of `other` are appended as new groups.
        features_in_group.emplace_back();
        for (int j = 0; j < f_cnt; ++j) {
          const int real_fidx = other->real_feature_idx_[f_start + j] + num_total_features_;
          features_in_group.back().push_back(real_fidx);
        }
        feature_groups_.emplace_back(
            new FeatureGroup(*other->feature_groups_[i], false, -1));
      }
    }
    // Regenerate the index-mapping fields from features_in_group.
    num_groups_ += other->num_groups_ - 1;
    CHECK(num_groups_ == static_cast<int>(features_in_group.size()));
    num_features_ += other->num_features_;
    int cur_fidx = 0;
    used_feature_map_ =
      std::vector<int>(num_total_features_ + other->num_total_features_, -1);
    real_feature_idx_.resize(num_features_);
    feature2group_.resize(num_features_);
    feature2subfeature_.resize(num_features_);
    group_feature_start_.resize(num_groups_);
    group_feature_cnt_.resize(num_groups_);

    group_bin_boundaries_.clear();
    uint64_t num_total_bin = 0;
    group_bin_boundaries_.push_back(num_total_bin);
    for (int i = 0; i < num_groups_; ++i) {
      auto cur_features = features_in_group[i];
      int cur_cnt_features = static_cast<int>(cur_features.size());
      group_feature_start_[i] = cur_fidx;
      group_feature_cnt_[i] = cur_cnt_features;
      for (int j = 0; j < cur_cnt_features; ++j) {
        int real_fidx = cur_features[j];
        used_feature_map_[real_fidx] = cur_fidx;
        real_feature_idx_[cur_fidx] = real_fidx;
        feature2group_[cur_fidx] = i;
        feature2subfeature_[cur_fidx] = j;
        ++cur_fidx;
      }
      num_total_bin += feature_groups_[i]->num_total_bin_;
      group_bin_boundaries_.push_back(num_total_bin);
    }
  }
  // Merge feature names, renaming duplicates with a "D<n>_" prefix.
  std::unordered_set<std::string> feature_names_set;
  for (const auto& val : feature_names_) {
    feature_names_set.emplace(val);
  }
  for (const auto& val : other->feature_names_) {
    std::string new_name = val;
    int cnt = 2;
    while (feature_names_set.count(new_name)) {
      new_name = "D" + std::to_string(cnt) + "_" + val;
      ++cnt;
    }
    if (new_name != val) {
      Log::Warning(
        "Find the same feature name (%s) in Dataset::AddFeaturesFrom, change "
        "its name to (%s)",
        val.c_str(), new_name.c_str());
    }
    feature_names_set.emplace(new_name);
    feature_names_.push_back(new_name);
  }
  PushVector(&forced_bin_bounds_, other->forced_bin_bounds_);
  PushClearIfEmpty(&max_bin_by_feature_, num_total_features_,
                   other->max_bin_by_feature_, other->num_total_features_, -1);
  num_total_features_ += other->num_total_features_;
  // Remap other's numeric features, offset by our numeric-feature count.
  for (size_t i = 0; i < (other->numeric_feature_map_).size(); ++i) {
    // BUGFIX: read from other's map (was numeric_feature_map_[i]).
    int feat_ind = other->numeric_feature_map_[i];
    if (feat_ind > -1) {
      numeric_feature_map_.push_back(feat_ind + num_numeric_features_);
    } else {
      numeric_feature_map_.push_back(-1);
    }
  }
  num_numeric_features_ += other->num_numeric_features_;
  if (has_raw_) {
    for (int i = 0; i < other->num_numeric_features_; ++i) {
      raw_data_.push_back(other->raw_data_[i]);
    }
  }
  #ifdef USE_CUDA_EXP
  if (device_type_ == std::string("cuda_exp")) {
    CreateCUDAColumnData();
  } else {
    cuda_column_data_ = nullptr;
  }
  #endif  // USE_CUDA_EXP
}

1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
// Returns the raw column data of one feature, delegating to the owning
// feature group. Out-parameters receive the storage bit width, sparsity
// flag, and per-thread bin iterators.
const void* Dataset::GetColWiseData(
  const int feature_group_index,
  const int sub_feature_index,
  uint8_t* bit_type,
  bool* is_sparse,
  std::vector<BinIterator*>* bin_iterator,
  const int num_threads) const {
  const auto& group = feature_groups_[feature_group_index];
  return group->GetColWiseData(sub_feature_index, bit_type, is_sparse,
                               bin_iterator, num_threads);
}

// Single-iterator overload: same as above but fills one BinIterator
// pointer instead of a per-thread vector of iterators.
const void* Dataset::GetColWiseData(
  const int feature_group_index,
  const int sub_feature_index,
  uint8_t* bit_type,
  bool* is_sparse,
  BinIterator** bin_iterator) const {
  const auto& group = feature_groups_[feature_group_index];
  return group->GetColWiseData(sub_feature_index, bit_type, is_sparse,
                               bin_iterator);
}

#ifdef USE_CUDA_EXP
// Builds the CUDA column-wise representation of the dataset: one CUDA
// column per sub-feature of a multi-value group, and one CUDA column per
// dense feature group. Collects per-feature bin metadata (bin ranges,
// most-frequent/default bins, missing-value handling flags) and hands
// everything to cuda_column_data_->Init().
// Refactor: the per-feature metadata filling, previously duplicated
// verbatim in the multi-value and dense branches, is extracted into the
// set_feature_meta lambda.
void Dataset::CreateCUDAColumnData() {
  cuda_column_data_.reset(new CUDAColumnData(num_data_, gpu_device_id_));
  int num_columns = 0;
  std::vector<const void*> column_data;
  std::vector<BinIterator*> column_bin_iterator;
  std::vector<uint8_t> column_bit_type;
  int feature_index = 0;
  std::vector<int> feature_to_column(num_features_, -1);
  std::vector<uint32_t> feature_max_bins(num_features_, 0);
  std::vector<uint32_t> feature_min_bins(num_features_, 0);
  std::vector<uint32_t> feature_offsets(num_features_, 0);
  std::vector<uint32_t> feature_most_freq_bins(num_features_, 0);
  std::vector<uint32_t> feature_default_bin(num_features_, 0);
  std::vector<uint8_t> feature_missing_is_zero(num_features_, 0);
  std::vector<uint8_t> feature_missing_is_na(num_features_, 0);
  std::vector<uint8_t> feature_mfb_is_zero(num_features_, 0);
  std::vector<uint8_t> feature_mfb_is_na(num_features_, 0);
  // Fills every per-feature metadata array at index `fidx`.
  auto set_feature_meta = [&](int fidx) {
    const BinMapper* feature_bin_mapper = FeatureBinMapper(fidx);
    feature_max_bins[fidx] = feature_max_bin(fidx);
    feature_min_bins[fidx] = feature_min_bin(fidx);
    const uint32_t most_freq_bin = feature_bin_mapper->GetMostFreqBin();
    // Bin values are shifted by one when bin 0 is not the most frequent.
    feature_offsets[fidx] = static_cast<uint32_t>(most_freq_bin == 0);
    feature_most_freq_bins[fidx] = most_freq_bin;
    feature_default_bin[fidx] = feature_bin_mapper->GetDefaultBin();
    if (feature_bin_mapper->missing_type() == MissingType::Zero) {
      feature_missing_is_zero[fidx] = 1;
      feature_missing_is_na[fidx] = 0;
      if (feature_default_bin[fidx] == feature_most_freq_bins[fidx]) {
        feature_mfb_is_zero[fidx] = 1;
      } else {
        feature_mfb_is_zero[fidx] = 0;
      }
      feature_mfb_is_na[fidx] = 0;
    } else if (feature_bin_mapper->missing_type() == MissingType::NaN) {
      feature_missing_is_zero[fidx] = 0;
      feature_missing_is_na[fidx] = 1;
      feature_mfb_is_zero[fidx] = 0;
      // NaN occupies the last bin; mfb_is_na when that bin is most frequent.
      if (feature_most_freq_bins[fidx] + feature_min_bins[fidx] == feature_max_bins[fidx] &&
          feature_most_freq_bins[fidx] > 0) {
        feature_mfb_is_na[fidx] = 1;
      } else {
        feature_mfb_is_na[fidx] = 0;
      }
    } else {
      feature_missing_is_zero[fidx] = 0;
      feature_missing_is_na[fidx] = 0;
      feature_mfb_is_zero[fidx] = 0;
      feature_mfb_is_na[fidx] = 0;
    }
  };
  for (int feature_group_index = 0; feature_group_index < num_groups_; ++feature_group_index) {
    if (feature_groups_[feature_group_index]->is_multi_val_) {
      // Multi-value group: one CUDA column per sub-feature.
      for (int sub_feature_index = 0; sub_feature_index < feature_groups_[feature_group_index]->num_feature_; ++sub_feature_index) {
        uint8_t bit_type = 0;
        bool is_sparse = false;
        BinIterator* bin_iterator = nullptr;
        const void* one_column_data = GetColWiseData(feature_group_index,
                                                     sub_feature_index,
                                                     &bit_type,
                                                     &is_sparse,
                                                     &bin_iterator);
        column_data.emplace_back(one_column_data);
        column_bin_iterator.emplace_back(bin_iterator);
        column_bit_type.emplace_back(bit_type);
        feature_to_column[feature_index] = num_columns;
        ++num_columns;
        set_feature_meta(feature_index);
        ++feature_index;
      }
    } else {
      // Dense group: one CUDA column shared by all features in the group.
      uint8_t bit_type = 0;
      bool is_sparse = false;
      BinIterator* bin_iterator = nullptr;
      const void* one_column_data = GetColWiseData(feature_group_index,
                                                   -1,
                                                   &bit_type,
                                                   &is_sparse,
                                                   &bin_iterator);
      column_data.emplace_back(one_column_data);
      column_bin_iterator.emplace_back(bin_iterator);
      column_bit_type.emplace_back(bit_type);
      for (int sub_feature_index = 0; sub_feature_index < feature_groups_[feature_group_index]->num_feature_; ++sub_feature_index) {
        feature_to_column[feature_index] = num_columns;
        set_feature_meta(feature_index);
        ++feature_index;
      }
      ++num_columns;
    }
  }
  cuda_column_data_->Init(num_columns,
                          column_data,
                          column_bin_iterator,
                          column_bit_type,
                          feature_max_bins,
                          feature_min_bins,
                          feature_offsets,
                          feature_most_freq_bins,
                          feature_default_bin,
                          feature_missing_is_zero,
                          feature_missing_is_na,
                          feature_mfb_is_zero,
                          feature_mfb_is_na,
                          feature_to_column);
}

#endif  // USE_CUDA_EXP

Guolin Ke's avatar
Guolin Ke committed
1664
}  // namespace LightGBM