/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for
 * license information.
 */
#include <LightGBM/dataset.h>

#include <LightGBM/feature_group.h>
#include <LightGBM/cuda/vector_cudahost.h>

#include <LightGBM/utils/array_args.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/utils/threading.h>

#include <chrono>
#include <cstdio>
#include <limits>
#include <sstream>
#include <unordered_map>

namespace LightGBM {
Guolin Ke's avatar
Guolin Ke committed
22
23
// Magic token appended to serialized binary dataset files; loading code uses
// it to verify that a file is a complete LightGBM binary Dataset.
const char* Dataset::binary_file_token =
    "______LightGBM_Binary_File_Token______\n";
Guolin Ke's avatar
Guolin Ke committed
24

Guolin Ke's avatar
Guolin Ke committed
25
/*!
 * \brief Construct an empty Dataset placeholder.
 *
 * No rows are bound yet (num_data_ == 0) and no data file is associated;
 * "noname" is the sentinel filename used until real data is loaded.
 */
Dataset::Dataset()
    : data_filename_("noname"),
      num_data_(0),
      is_finish_load_(false),
      has_raw_(false) {}

32
/*!
 * \brief Construct a Dataset sized for a known number of rows.
 * \param num_data Number of data rows; must be strictly positive.
 *
 * Initializes metadata (labels/weights/queries) for num_data rows and seeds
 * the group-bin boundary list with the leading 0 sentinel.
 */
Dataset::Dataset(data_size_t num_data) {
  CHECK_GT(num_data, 0);
  data_filename_ = "noname";
  num_data_ = num_data;
  // NO_SPECIFIC: no weight / query columns are declared at this point.
  metadata_.Init(num_data_, NO_SPECIFIC, NO_SPECIFIC);
  is_finish_load_ = false;
  // group_bin_boundaries_ is a prefix-sum of bins per group; starts at 0.
  group_bin_boundaries_.push_back(0);
  has_raw_ = false;
}

Guolin Ke's avatar
Guolin Ke committed
42
// All owned resources (feature groups, metadata) are RAII-managed; nothing to do.
Dataset::~Dataset() {}
Guolin Ke's avatar
Guolin Ke committed
43

Guolin Ke's avatar
Guolin Ke committed
44
/*!
 * \brief Trivial grouping: place every used feature in its own group.
 * \param used_features Indices of the features to group.
 * \return One singleton group per feature, in the same order as the input.
 */
std::vector<std::vector<int>> NoGroup(const std::vector<int>& used_features) {
  std::vector<std::vector<int>> groups;
  groups.reserve(used_features.size());
  for (const int fidx : used_features) {
    // vector<int>(1, fidx) — a singleton group holding just this feature.
    groups.emplace_back(1, fidx);
  }
  return groups;
}

guanqun's avatar
guanqun committed
53
int GetConflictCount(const std::vector<bool>& mark, const int* indices,
Guolin Ke's avatar
Guolin Ke committed
54
                     int num_indices, data_size_t max_cnt) {
Guolin Ke's avatar
Guolin Ke committed
55
56
57
58
  int ret = 0;
  for (int i = 0; i < num_indices; ++i) {
    if (mark[indices[i]]) {
      ++ret;
59
60
61
    }
    if (ret > max_cnt) {
      return -1;
Guolin Ke's avatar
Guolin Ke committed
62
63
64
65
    }
  }
  return ret;
}
66

Guolin Ke's avatar
Guolin Ke committed
67
68
void MarkUsed(std::vector<bool>* mark, const int* indices,
              data_size_t num_indices) {
Guolin Ke's avatar
Guolin Ke committed
69
  auto& ref_mark = *mark;
Guolin Ke's avatar
Guolin Ke committed
70
  for (int i = 0; i < num_indices; ++i) {
Guolin Ke's avatar
Guolin Ke committed
71
    ref_mark[indices[i]] = true;
Guolin Ke's avatar
Guolin Ke committed
72
73
74
  }
}

Guolin Ke's avatar
Guolin Ke committed
75
76
77
78
/*!
 * \brief Recompute the non-most-frequent-bin row indices of a sampled column.
 *
 * When a feature's default bin differs from its most frequent bin, the sparse
 * sample (rows with non-default values) does not match the set of rows whose
 * bin differs from the most frequent one. This walks all rows and rebuilds
 * that set: sampled rows are kept if their value maps to a non-most-freq bin,
 * and every unsampled row (implicitly default-valued) is kept as well.
 *
 * \param bin_mapper        Bin mapper of the feature.
 * \param num_total_samples Total number of sampled rows.
 * \param num_indices       Length of \p sample_indices / \p sample_values.
 * \param sample_indices    Sorted row indices with non-default values.
 * \param sample_values     Values at those rows (parallel array).
 * \return Corrected index list, or an empty vector when no fix is needed
 *         (default bin == most frequent bin).
 */
std::vector<int> FixSampleIndices(const BinMapper* bin_mapper,
                                  int num_total_samples, int num_indices,
                                  const int* sample_indices,
                                  const double* sample_values) {
  std::vector<int> ret;
  // Nothing to fix: "missing from sample" and "most frequent" coincide.
  if (bin_mapper->GetDefaultBin() == bin_mapper->GetMostFreqBin()) {
    return ret;
  }
  // Two-pointer merge over [0, num_total_samples) and the sparse index list.
  int i = 0, j = 0;
  while (i < num_total_samples) {
    if (j < num_indices && sample_indices[j] < i) {
      ++j;
    } else if (j < num_indices && sample_indices[j] == i) {
      // Sampled row: keep only if its bin is not the most frequent bin.
      if (bin_mapper->ValueToBin(sample_values[j]) !=
          bin_mapper->GetMostFreqBin()) {
        ret.push_back(i);
      }
      ++i;
    } else {
      // Unsampled row holds the default value, which here is not the most
      // frequent bin, so it must be recorded.
      ret.push_back(i++);
    }
  }
  return ret;
}

Guolin Ke's avatar
Guolin Ke committed
100
101
102
103
104
105
/*!
 * \brief Greedily bundle (nearly) mutually-exclusive features into groups.
 *
 * Features whose non-default rows rarely overlap can share one histogram
 * column (exclusive feature bundling). Each candidate feature is tried
 * against a random subset of existing groups; it joins the first group whose
 * conflict count stays within budget, otherwise it starts a new group. When
 * \p is_sparse is set, a second pass moves low-density groups into a single
 * trailing group that may become a multi-value group.
 *
 * \param bin_mappers      Bin mappers for all features (indexed by fidx).
 * \param find_order       Feature indices in the order to try bundling.
 * \param sample_indices   Per-column sorted row indices of non-default rows.
 * \param num_per_col      Per-column count of entries in sample_indices.
 * \param num_sample_col   Number of sampled columns; fidx >= this means the
 *                         feature was filtered from sampling (treated empty).
 * \param total_sample_cnt Number of sampled rows.
 * \param num_data         Number of rows in the full dataset (used as RNG seed).
 * \param is_use_gpu       Limits a group's total bins to 256 for GPU kernels.
 * \param is_sparse        Enables the second (multi-val) bundling round.
 * \param multi_val_group  Output: per-group flag, 1 if group is multi-value.
 * \return Feature indices per group.
 */
std::vector<std::vector<int>> FindGroups(
    const std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
    const std::vector<int>& find_order, int** sample_indices,
    const int* num_per_col, int num_sample_col, data_size_t total_sample_cnt,
    data_size_t num_data, bool is_use_gpu, bool is_sparse,
    std::vector<int8_t>* multi_val_group) {
  // At most this many existing groups are probed per feature.
  const int max_search_group = 100;
  // GPU histogram kernels work on groups of at most 256 bins.
  const int max_bin_per_group = 256;
  // Allowed conflicts: 0.01% of the sampled rows.
  const data_size_t single_val_max_conflict_cnt =
      static_cast<data_size_t>(total_sample_cnt / 10000);
  multi_val_group->clear();
  Random rand(num_data);
  std::vector<std::vector<int>> features_in_group;
  // conflict_marks[gid][row] == true when some feature in group gid already
  // occupies that row.
  std::vector<std::vector<bool>> conflict_marks;
  std::vector<data_size_t> group_used_row_cnt;
  std::vector<data_size_t> group_total_data_cnt;
  std::vector<int> group_num_bin;
  // first round: fill the single val group
  for (auto fidx : find_order) {
    bool is_filtered_feature = fidx >= num_sample_col;
    const data_size_t cur_non_zero_cnt =
        is_filtered_feature ? 0 : num_per_col[fidx];
    // Collect groups that could still absorb this feature.
    std::vector<int> available_groups;
    for (int gid = 0; gid < static_cast<int>(features_in_group.size()); ++gid) {
      auto cur_num_bin = group_num_bin[gid] + bin_mappers[fidx]->num_bin() +
                         (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0);
      if (group_total_data_cnt[gid] + cur_non_zero_cnt <=
          total_sample_cnt + single_val_max_conflict_cnt) {
        if (!is_use_gpu || cur_num_bin <= max_bin_per_group) {
          available_groups.push_back(gid);
        }
      }
    }
    // Probe the newest group plus a random sample of the rest, to keep the
    // search cost bounded.
    std::vector<int> search_groups;
    if (!available_groups.empty()) {
      int last = static_cast<int>(available_groups.size()) - 1;
      auto indices = rand.Sample(last, std::min(last, max_search_group - 1));
      // always push the last group
      search_groups.push_back(available_groups.back());
      for (auto idx : indices) {
        search_groups.push_back(available_groups[idx]);
      }
    }
    int best_gid = -1;
    int best_conflict_cnt = -1;
    for (auto gid : search_groups) {
      // Remaining conflict budget for this group.
      const data_size_t rest_max_cnt = single_val_max_conflict_cnt -
                                       group_total_data_cnt[gid] +
                                       group_used_row_cnt[gid];
      const data_size_t cnt =
          is_filtered_feature
              ? 0
              : GetConflictCount(conflict_marks[gid], sample_indices[fidx],
                                 num_per_col[fidx], rest_max_cnt);
      // Accept the first group whose conflicts are within budget and affect
      // at most half of this feature's non-default rows.
      if (cnt >= 0 && cnt <= rest_max_cnt && cnt <= cur_non_zero_cnt / 2) {
        best_gid = gid;
        best_conflict_cnt = cnt;
        break;
      }
    }
    if (best_gid >= 0) {
      // Join the found group and update its bookkeeping.
      features_in_group[best_gid].push_back(fidx);
      group_total_data_cnt[best_gid] += cur_non_zero_cnt;
      group_used_row_cnt[best_gid] += cur_non_zero_cnt - best_conflict_cnt;
      if (!is_filtered_feature) {
        MarkUsed(&conflict_marks[best_gid], sample_indices[fidx],
                 num_per_col[fidx]);
      }
      group_num_bin[best_gid] +=
          bin_mappers[fidx]->num_bin() +
          (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0);
    } else {
      // No compatible group: open a new one seeded with this feature.
      features_in_group.emplace_back();
      features_in_group.back().push_back(fidx);
      conflict_marks.emplace_back(total_sample_cnt, false);
      if (!is_filtered_feature) {
        MarkUsed(&(conflict_marks.back()), sample_indices[fidx],
                 num_per_col[fidx]);
      }
      group_total_data_cnt.emplace_back(cur_non_zero_cnt);
      group_used_row_cnt.emplace_back(cur_non_zero_cnt);
      group_num_bin.push_back(
          1 + bin_mappers[fidx]->num_bin() +
          (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0));
    }
  }
  if (!is_sparse) {
    // Dense mode: every group is single-value; no second round.
    multi_val_group->resize(features_in_group.size(), false);
    return features_in_group;
  }
  // Second round: keep dense groups, pool features from sparse groups.
  std::vector<int> second_round_features;
  std::vector<std::vector<int>> features_in_group2;
  std::vector<std::vector<bool>> conflict_marks2;

  const double dense_threshold = 0.4;
  for (int gid = 0; gid < static_cast<int>(features_in_group.size()); ++gid) {
    const double dense_rate =
        static_cast<double>(group_used_row_cnt[gid]) / total_sample_cnt;
    if (dense_rate >= dense_threshold) {
      features_in_group2.push_back(std::move(features_in_group[gid]));
      conflict_marks2.push_back(std::move(conflict_marks[gid]));
    } else {
      for (auto fidx : features_in_group[gid]) {
        second_round_features.push_back(fidx);
      }
    }
  }

  features_in_group = features_in_group2;
  conflict_marks = conflict_marks2;
  multi_val_group->resize(features_in_group.size(), false);
  if (!second_round_features.empty()) {
    // All leftover sparse features go into one trailing group; it becomes a
    // multi-value group once the single-value conflict budget is blown
    // (always, on GPU).
    features_in_group.emplace_back();
    conflict_marks.emplace_back(total_sample_cnt, false);
    bool is_multi_val = is_use_gpu ? true : false;
    int conflict_cnt = 0;
    for (auto fidx : second_round_features) {
      features_in_group.back().push_back(fidx);
      if (!is_multi_val) {
        const int rest_max_cnt = single_val_max_conflict_cnt - conflict_cnt;
        const auto cnt =
            GetConflictCount(conflict_marks.back(), sample_indices[fidx],
                             num_per_col[fidx], rest_max_cnt);
        conflict_cnt += cnt;
        if (cnt < 0 || conflict_cnt > single_val_max_conflict_cnt) {
          is_multi_val = true;
          continue;
        }
        MarkUsed(&(conflict_marks.back()), sample_indices[fidx],
                 num_per_col[fidx]);
      }
    }
    multi_val_group->push_back(is_multi_val);
  }
  return features_in_group;
}

Guolin Ke's avatar
Guolin Ke committed
239
240
241
242
243
244
/*!
 * \brief Run feature bundling with two candidate orders and keep the better.
 *
 * Tries FindGroups() on the natural feature order and on a densest-first
 * order, keeps whichever produces fewer groups, then shuffles the group
 * order. Before grouping, sample index lists are corrected via
 * FixSampleIndices() for features whose default bin differs from the most
 * frequent bin.
 *
 * \param bin_mappers      Bin mappers for all features.
 * \param sample_indices   Per-column non-default row indices; entries may be
 *                         repointed at locally-owned fixed lists (in/out).
 * \param sample_values    Per-column sampled values.
 * \param num_per_col      Per-column sample counts.
 * \param num_sample_col   Number of sampled columns.
 * \param total_sample_cnt Number of sampled rows.
 * \param used_features    Features to bundle.
 * \param num_data         Row count of the full dataset (RNG seed).
 * \param is_use_gpu       Forwarded to FindGroups (bin-count cap).
 * \param is_sparse        Forwarded to FindGroups (multi-val round).
 * \param multi_val_group  Output: per-group multi-value flags.
 * \return Feature indices per group.
 */
std::vector<std::vector<int>> FastFeatureBundling(
    const std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
    int** sample_indices, double** sample_values, const int* num_per_col,
    int num_sample_col, data_size_t total_sample_cnt,
    const std::vector<int>& used_features, data_size_t num_data,
    bool is_use_gpu, bool is_sparse, std::vector<int8_t>* multi_val_group) {
  Common::FunctionTimer fun_timer("Dataset::FastFeatureBundling", global_timer);
  std::vector<size_t> feature_non_zero_cnt;
  feature_non_zero_cnt.reserve(used_features.size());
  // put dense feature first
  for (auto fidx : used_features) {
    if (fidx < num_sample_col) {
      feature_non_zero_cnt.emplace_back(num_per_col[fidx]);
    } else {
      // Filtered-out feature: treated as having no sampled values.
      feature_non_zero_cnt.emplace_back(0);
    }
  }
  // sort by non zero cnt
  std::vector<int> sorted_idx;
  sorted_idx.reserve(used_features.size());
  for (int i = 0; i < static_cast<int>(used_features.size()); ++i) {
    sorted_idx.emplace_back(i);
  }
  // sort by non zero cnt, bigger first
  std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
                   [&feature_non_zero_cnt](int a, int b) {
                     return feature_non_zero_cnt[a] > feature_non_zero_cnt[b];
                   });

  std::vector<int> feature_order_by_cnt;
  feature_order_by_cnt.reserve(sorted_idx.size());
  for (auto sidx : sorted_idx) {
    feature_order_by_cnt.push_back(used_features[sidx]);
  }

  // Fix sample index lists where default bin != most frequent bin. The fixed
  // lists live in tmp_indices; storing .data() pointers is safe because outer
  // vector reallocation moves the inner vectors without touching their heap
  // buffers.
  std::vector<std::vector<int>> tmp_indices;
  std::vector<int> tmp_num_per_col(num_sample_col, 0);
  for (auto fidx : used_features) {
    if (fidx >= num_sample_col) {
      continue;
    }
    auto ret = FixSampleIndices(
        bin_mappers[fidx].get(), static_cast<int>(total_sample_cnt),
        num_per_col[fidx], sample_indices[fidx], sample_values[fidx]);
    if (!ret.empty()) {
      tmp_indices.push_back(ret);
      tmp_num_per_col[fidx] = static_cast<int>(ret.size());
      sample_indices[fidx] = tmp_indices.back().data();
    } else {
      tmp_num_per_col[fidx] = num_per_col[fidx];
    }
  }
  std::vector<int8_t> group_is_multi_val, group_is_multi_val2;
  // Try both orders and keep the grouping with fewer groups.
  auto features_in_group =
      FindGroups(bin_mappers, used_features, sample_indices,
                 tmp_num_per_col.data(), num_sample_col, total_sample_cnt,
                 num_data, is_use_gpu, is_sparse, &group_is_multi_val);
  auto group2 =
      FindGroups(bin_mappers, feature_order_by_cnt, sample_indices,
                 tmp_num_per_col.data(), num_sample_col, total_sample_cnt,
                 num_data, is_use_gpu, is_sparse, &group_is_multi_val2);

  if (features_in_group.size() > group2.size()) {
    features_in_group = group2;
    group_is_multi_val = group_is_multi_val2;
  }
  // shuffle groups
  int num_group = static_cast<int>(features_in_group.size());
  Random tmp_rand(num_data);
  for (int i = 0; i < num_group - 1; ++i) {
    int j = tmp_rand.NextShort(i + 1, num_group);
    std::swap(features_in_group[i], features_in_group[j]);
    // Using std::swap for vector<bool> will cause the wrong result.
    std::swap(group_is_multi_val[i], group_is_multi_val[j]);
  }
  *multi_val_group = group_is_multi_val;
  return features_in_group;
}

Guolin Ke's avatar
Guolin Ke committed
318
319
320
321
322
323
/*!
 * \brief Build the internal feature-group layout of the Dataset.
 *
 * Selects non-trivial features, optionally bundles them (EFB), then creates
 * one FeatureGroup per bundle, taking ownership of the corresponding bin
 * mappers, and fills all index-mapping and bin-boundary bookkeeping.
 *
 * \param bin_mappers             Bin mappers per total feature; ownership of
 *                                the used ones is transferred (released) into
 *                                the created FeatureGroups.
 * \param num_total_features      Total feature count (must equal mapper count).
 * \param forced_bins             User-forced bin upper bounds per feature.
 * \param sample_non_zero_indices Per-column sampled non-default row indices.
 * \param sample_values           Per-column sampled values.
 * \param num_per_col             Per-column sample counts.
 * \param num_sample_col          Number of sampled columns.
 * \param total_sample_cnt        Number of sampled rows.
 * \param io_config               Configuration (device, bundling, bin limits).
 */
void Dataset::Construct(std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
                        int num_total_features,
                        const std::vector<std::vector<double>>& forced_bins,
                        int** sample_non_zero_indices, double** sample_values,
                        const int* num_per_col, int num_sample_col,
                        size_t total_sample_cnt, const Config& io_config) {
  num_total_features_ = num_total_features;
  CHECK_EQ(num_total_features_, static_cast<int>(bin_mappers->size()));
  // get num_features: keep only features with a non-trivial bin mapper.
  std::vector<int> used_features;
  auto& ref_bin_mappers = *bin_mappers;
  for (int i = 0; i < static_cast<int>(bin_mappers->size()); ++i) {
    if (ref_bin_mappers[i] != nullptr && !ref_bin_mappers[i]->is_trivial()) {
      used_features.emplace_back(i);
    }
  }
  if (used_features.empty()) {
    Log::Warning(
        "There are no meaningful features which satisfy the provided configuration. "
        "Decreasing Dataset parameters min_data_in_bin or min_data_in_leaf and re-constructing "
        "Dataset might resolve this warning.");
  }
  // Start from one-feature-per-group; bundling may replace this below.
  auto features_in_group = NoGroup(used_features);

  auto is_sparse = io_config.is_enable_sparse;
  if (io_config.device_type == std::string("cuda")) {
      LGBM_config_::current_device = lgbm_device_cuda;
      if (is_sparse) {
        Log::Warning("Using sparse features with CUDA is currently not supported.");
      }
      is_sparse = false;
  }

  std::vector<int8_t> group_is_multi_val(used_features.size(), 0);
  if (io_config.enable_bundle && !used_features.empty()) {
    // GPU and CUDA both take the GPU-constrained bundling path.
    bool lgbm_is_gpu_used = io_config.device_type == std::string("gpu") || io_config.device_type == std::string("cuda");
    features_in_group = FastFeatureBundling(
        *bin_mappers, sample_non_zero_indices, sample_values, num_per_col,
        num_sample_col, static_cast<data_size_t>(total_sample_cnt),
        used_features, num_data_, lgbm_is_gpu_used,
        is_sparse, &group_is_multi_val);
  }

  num_features_ = 0;
  for (const auto& fs : features_in_group) {
    num_features_ += static_cast<int>(fs.size());
  }
  int cur_fidx = 0;
  // -1 marks total-feature indices that are unused (trivial features).
  used_feature_map_ = std::vector<int>(num_total_features_, -1);
  num_groups_ = static_cast<int>(features_in_group.size());
  real_feature_idx_.resize(num_features_);
  feature2group_.resize(num_features_);
  feature2subfeature_.resize(num_features_);
  feature_need_push_zeros_.clear();
  group_bin_boundaries_.clear();
  uint64_t num_total_bin = 0;
  group_bin_boundaries_.push_back(num_total_bin);
  group_feature_start_.resize(num_groups_);
  group_feature_cnt_.resize(num_groups_);
  for (int i = 0; i < num_groups_; ++i) {
    auto cur_features = features_in_group[i];
    int cur_cnt_features = static_cast<int>(cur_features.size());
    group_feature_start_[i] = cur_fidx;
    group_feature_cnt_[i] = cur_cnt_features;
    // get bin_mappers: move ownership of this group's mappers out of the input.
    std::vector<std::unique_ptr<BinMapper>> cur_bin_mappers;
    for (int j = 0; j < cur_cnt_features; ++j) {
      int real_fidx = cur_features[j];
      used_feature_map_[real_fidx] = cur_fidx;
      real_feature_idx_[cur_fidx] = real_fidx;
      feature2group_[cur_fidx] = i;
      feature2subfeature_[cur_fidx] = j;
      cur_bin_mappers.emplace_back(ref_bin_mappers[real_fidx].release());
      // Features whose default bin is not the most frequent bin need explicit
      // zero pushes when rows are inserted.
      if (cur_bin_mappers.back()->GetDefaultBin() !=
          cur_bin_mappers.back()->GetMostFreqBin()) {
        feature_need_push_zeros_.push_back(cur_fidx);
      }
      ++cur_fidx;
    }
    feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>(
      new FeatureGroup(cur_cnt_features, group_is_multi_val[i], &cur_bin_mappers, num_data_, i)));
    // group_bin_boundaries_ is the running prefix-sum of bins over groups.
    num_total_bin += feature_groups_[i]->num_total_bin_;
    group_bin_boundaries_.push_back(num_total_bin);
  }
  if (!io_config.max_bin_by_feature.empty()) {
    CHECK_EQ(static_cast<size_t>(num_total_features_),
             io_config.max_bin_by_feature.size());
    CHECK_GT(*(std::min_element(io_config.max_bin_by_feature.begin(),
                                io_config.max_bin_by_feature.end())), 1);
    max_bin_by_feature_.resize(num_total_features_);
    max_bin_by_feature_.assign(io_config.max_bin_by_feature.begin(),
                               io_config.max_bin_by_feature.end());
  }
  forced_bin_bounds_ = forced_bins;
  max_bin_ = io_config.max_bin;
  min_data_in_bin_ = io_config.min_data_in_bin;
  bin_construct_sample_cnt_ = io_config.bin_construct_sample_cnt;
  use_missing_ = io_config.use_missing;
  zero_as_missing_ = io_config.zero_as_missing;
  // Linear trees need access to the raw (unbinned) feature values.
  has_raw_ = false;
  if (io_config.linear_tree) {
    has_raw_ = true;
  }
  // Map inner feature index -> dense numeric-feature index (-1 for categorical).
  numeric_feature_map_ = std::vector<int>(num_features_, -1);
  num_numeric_features_ = 0;
  for (int i = 0; i < num_features_; ++i) {
    if (FeatureBinMapper(i)->bin_type() == BinType::NumericalBin) {
      numeric_feature_map_[i] = num_numeric_features_;
      ++num_numeric_features_;
    }
  }
}

Guolin Ke's avatar
Guolin Ke committed
431
void Dataset::FinishLoad() {
Guolin Ke's avatar
Guolin Ke committed
432
433
434
  if (is_finish_load_) {
    return;
  }
435
436
  if (num_groups_ > 0) {
    for (int i = 0; i < num_groups_; ++i) {
437
      feature_groups_[i]->FinishLoad();
438
    }
Guolin Ke's avatar
Guolin Ke committed
439
  }
Guolin Ke's avatar
Guolin Ke committed
440
  is_finish_load_ = true;
Guolin Ke's avatar
Guolin Ke committed
441
}
Guolin Ke's avatar
Guolin Ke committed
442

443
void PushDataToMultiValBin(
Guolin Ke's avatar
Guolin Ke committed
444
    data_size_t num_data, const std::vector<uint32_t> most_freq_bins,
445
    const std::vector<uint32_t> offsets,
Guolin Ke's avatar
Guolin Ke committed
446
    std::vector<std::vector<std::unique_ptr<BinIterator>>>* iters,
447
448
449
    MultiValBin* ret) {
  Common::FunctionTimer fun_time("Dataset::PushDataToMultiValBin",
                                 global_timer);
450
  if (ret->IsSparse()) {
Guolin Ke's avatar
Guolin Ke committed
451
452
453
454
455
    Threading::For<data_size_t>(
        0, num_data, 1024, [&](int tid, data_size_t start, data_size_t end) {
          std::vector<uint32_t> cur_data;
          cur_data.reserve(most_freq_bins.size());
          for (size_t j = 0; j < most_freq_bins.size(); ++j) {
Guolin Ke's avatar
Guolin Ke committed
456
            (*iters)[tid][j]->Reset(start);
457
          }
Guolin Ke's avatar
Guolin Ke committed
458
459
460
          for (data_size_t i = start; i < end; ++i) {
            cur_data.clear();
            for (size_t j = 0; j < most_freq_bins.size(); ++j) {
461
              // for sparse multi value bin, we store the feature bin values with offset added
Guolin Ke's avatar
Guolin Ke committed
462
              auto cur_bin = (*iters)[tid][j]->Get(i);
Guolin Ke's avatar
Guolin Ke committed
463
464
465
466
467
468
469
470
471
472
              if (cur_bin == most_freq_bins[j]) {
                continue;
              }
              cur_bin += offsets[j];
              if (most_freq_bins[j] == 0) {
                cur_bin -= 1;
              }
              cur_data.push_back(cur_bin);
            }
            ret->PushOneRow(tid, i, cur_data);
473
          }
Guolin Ke's avatar
Guolin Ke committed
474
        });
475
  } else {
Guolin Ke's avatar
Guolin Ke committed
476
477
478
479
    Threading::For<data_size_t>(
        0, num_data, 1024, [&](int tid, data_size_t start, data_size_t end) {
          std::vector<uint32_t> cur_data(most_freq_bins.size(), 0);
          for (size_t j = 0; j < most_freq_bins.size(); ++j) {
Guolin Ke's avatar
Guolin Ke committed
480
            (*iters)[tid][j]->Reset(start);
Guolin Ke's avatar
Guolin Ke committed
481
482
483
          }
          for (data_size_t i = start; i < end; ++i) {
            for (size_t j = 0; j < most_freq_bins.size(); ++j) {
484
              // for dense multi value bin, the feature bin values without offsets are used
Guolin Ke's avatar
Guolin Ke committed
485
              auto cur_bin = (*iters)[tid][j]->Get(i);
Guolin Ke's avatar
Guolin Ke committed
486
              cur_data[j] = cur_bin;
487
            }
Guolin Ke's avatar
Guolin Ke committed
488
            ret->PushOneRow(tid, i, cur_data);
489
          }
Guolin Ke's avatar
Guolin Ke committed
490
        });
491
492
493
  }
}

494
/*!
 * \brief Build a MultiValBin from the single multi-value feature group.
 * \param offsets Per-feature bin offsets; offsets.back() is the total bin count.
 * \return Heap-allocated MultiValBin (caller owns), or nullptr when no
 *         multi-value group exists.
 */
MultiValBin* Dataset::GetMultiBinFromSparseFeatures(const std::vector<uint32_t>& offsets) const {
  Common::FunctionTimer fun_time("Dataset::GetMultiBinFromSparseFeatures",
                                 global_timer);
  // Locate the unique multi-value group; more than one is an internal error.
  int multi_group_id = -1;
  for (int i = 0; i < num_groups_; ++i) {
    if (feature_groups_[i]->is_multi_val_) {
      if (multi_group_id < 0) {
        multi_group_id = i;
      } else {
        Log::Fatal("Bug. There should be only one multi-val group.");
      }
    }
  }
  if (multi_group_id < 0) {
    return nullptr;
  }
  const int num_feature = feature_groups_[multi_group_id]->num_feature_;
  int num_threads = OMP_NUM_THREADS();

  // One iterator per (thread, sub-feature); created in parallel per feature.
  std::vector<std::vector<std::unique_ptr<BinIterator>>> iters(num_threads);
  std::vector<uint32_t> most_freq_bins;
  double sum_sparse_rate = 0;
  for (int i = 0; i < num_feature; ++i) {
#pragma omp parallel for schedule(static, 1)
    for (int tid = 0; tid < num_threads; ++tid) {
      iters[tid].emplace_back(
          feature_groups_[multi_group_id]->SubFeatureIterator(i));
    }
    most_freq_bins.push_back(
        feature_groups_[multi_group_id]->bin_mappers_[i]->GetMostFreqBin());
    sum_sparse_rate +=
        feature_groups_[multi_group_id]->bin_mappers_[i]->sparse_rate();
  }
  // Average sparse rate drives the sparse/dense choice in CreateMultiValBin.
  sum_sparse_rate /= num_feature;
  Log::Debug("Dataset::GetMultiBinFromSparseFeatures: sparse rate %f",
             sum_sparse_rate);
  std::unique_ptr<MultiValBin> ret;
  ret.reset(MultiValBin::CreateMultiValBin(num_data_, offsets.back(),
                                           num_feature, sum_sparse_rate, offsets));
  PushDataToMultiValBin(num_data_, most_freq_bins, offsets, &iters, ret.get());
  ret->FinishLoad();
  return ret.release();
}

538
/*!
 * \brief Build a row-wise MultiValBin covering every feature group.
 *
 * Multi-value groups contribute one column per sub-feature; single-value
 * groups contribute one packed column (most_freq_bin recorded as 0).
 *
 * \param offsets Per-column bin offsets; offsets.back() is the total bin count.
 * \return Heap-allocated MultiValBin; caller takes ownership.
 */
MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector<uint32_t>& offsets) const {
  Common::FunctionTimer fun_time("Dataset::GetMultiBinFromAllFeatures",
                                 global_timer);
  int num_threads = OMP_NUM_THREADS();
  double sum_dense_ratio = 0;

  std::unique_ptr<MultiValBin> ret;
  std::vector<std::vector<std::unique_ptr<BinIterator>>> iters(num_threads);
  std::vector<uint32_t> most_freq_bins;
  // First pass: count output columns and accumulate the dense ratio.
  int ncol = 0;
  for (int gid = 0; gid < num_groups_; ++gid) {
    if (feature_groups_[gid]->is_multi_val_) {
      ncol += feature_groups_[gid]->num_feature_;
    } else {
      ++ncol;
    }
    for (int fid = 0; fid < feature_groups_[gid]->num_feature_; ++fid) {
      const auto& bin_mapper = feature_groups_[gid]->bin_mappers_[fid];
      sum_dense_ratio += 1.0f - bin_mapper->sparse_rate();
    }
  }
  sum_dense_ratio /= ncol;
  // Second pass: create one iterator per (thread, column).
  for (int gid = 0; gid < num_groups_; ++gid) {
    if (feature_groups_[gid]->is_multi_val_) {
      for (int fid = 0; fid < feature_groups_[gid]->num_feature_; ++fid) {
        const auto& bin_mapper = feature_groups_[gid]->bin_mappers_[fid];
        most_freq_bins.push_back(bin_mapper->GetMostFreqBin());
#pragma omp parallel for schedule(static, 1)
        for (int tid = 0; tid < num_threads; ++tid) {
          iters[tid].emplace_back(
              feature_groups_[gid]->SubFeatureIterator(fid));
        }
      }
    } else {
      // Whole single-value group iterated as one column; 0 acts as its
      // most-frequent bin within the packed group representation.
      most_freq_bins.push_back(0);
      for (int tid = 0; tid < num_threads; ++tid) {
        iters[tid].emplace_back(feature_groups_[gid]->FeatureGroupIterator());
      }
    }
  }
  CHECK(static_cast<int>(most_freq_bins.size()) == ncol);
  Log::Debug("Dataset::GetMultiBinFromAllFeatures: sparse rate %f",
             1.0 - sum_dense_ratio);
  ret.reset(MultiValBin::CreateMultiValBin(
      num_data_, offsets.back(), static_cast<int>(most_freq_bins.size()),
      1.0 - sum_dense_ratio, offsets));
  PushDataToMultiValBin(num_data_, most_freq_bins, offsets, &iters, ret.get());
  ret->FinishLoad();
  return ret.release();
}

589
/*!
 * \brief Create the training share state, choosing col-wise vs row-wise
 *        histogram construction.
 *
 * If neither mode is forced, both are built, benchmarked with one histogram
 * construction pass each, and the faster one is returned; the loser's init
 * cost is reported as overhead.
 *
 * \param gradients/hessians  Buffers used for the benchmark passes.
 * \param is_feature_used     Per-feature usage flags for InitTrain/benchmark.
 * \param is_constant_hessian Stored on the returned state.
 * \param force_col_wise / force_row_wise Mutually exclusive overrides.
 * \return Heap-allocated TrainingShareStates; caller takes ownership.
 */
TrainingShareStates* Dataset::GetShareStates(
    score_t* gradients, score_t* hessians,
    const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
    bool force_col_wise, bool force_row_wise) const {
  // NOTE(review): timer label does not match the function name — confirm
  // whether "Dataset::TestMultiThreadingMethod" is intentional (legacy name).
  Common::FunctionTimer fun_timer("Dataset::TestMultiThreadingMethod",
                                  global_timer);
  if (force_col_wise && force_row_wise) {
    Log::Fatal(
        "Cannot set both of `force_col_wise` and `force_row_wise` to `true` at "
        "the same time");
  }
  if (num_groups_ <= 0) {
    // No features: trivial col-wise state.
    TrainingShareStates* share_state = new TrainingShareStates();
    share_state->is_col_wise = true;
    share_state->is_constant_hessian = is_constant_hessian;
    return share_state;
  }
  if (force_col_wise) {
    TrainingShareStates* share_state = new TrainingShareStates();
    std::vector<uint32_t> offsets;
    share_state->CalcBinOffsets(
      feature_groups_, &offsets, true);
    share_state->SetMultiValBin(GetMultiBinFromSparseFeatures(offsets),
      num_data_, feature_groups_, false, true);
    share_state->is_col_wise = true;
    share_state->is_constant_hessian = is_constant_hessian;
    return share_state;
  } else if (force_row_wise) {
    TrainingShareStates* share_state = new TrainingShareStates();
    std::vector<uint32_t> offsets;
    share_state->CalcBinOffsets(
      feature_groups_, &offsets, false);
    share_state->SetMultiValBin(GetMultiBinFromAllFeatures(offsets), num_data_,
      feature_groups_, false, false);
    share_state->is_col_wise = false;
    share_state->is_constant_hessian = is_constant_hessian;
    return share_state;
  } else {
    // Auto mode: build both states, time one histogram pass each, keep winner.
    std::unique_ptr<MultiValBin> sparse_bin;
    std::unique_ptr<MultiValBin> all_bin;
    std::unique_ptr<TrainingShareStates> col_wise_state;
    std::unique_ptr<TrainingShareStates> row_wise_state;
    col_wise_state.reset(new TrainingShareStates());
    row_wise_state.reset(new TrainingShareStates());

    std::chrono::duration<double, std::milli> col_wise_init_time, row_wise_init_time;
    auto start_time = std::chrono::steady_clock::now();
    std::vector<uint32_t> col_wise_offsets;
    col_wise_state->CalcBinOffsets(feature_groups_, &col_wise_offsets, true);
    col_wise_state->SetMultiValBin(GetMultiBinFromSparseFeatures(col_wise_offsets), num_data_,
      feature_groups_, false, true);
    col_wise_init_time = std::chrono::steady_clock::now() - start_time;

    start_time = std::chrono::steady_clock::now();
    std::vector<uint32_t> row_wise_offsets;
    row_wise_state->CalcBinOffsets(feature_groups_, &row_wise_offsets, false);
    row_wise_state->SetMultiValBin(GetMultiBinFromAllFeatures(row_wise_offsets), num_data_,
      feature_groups_, false, false);
    row_wise_init_time = std::chrono::steady_clock::now() - start_time;

    // Shared scratch histogram big enough for either layout (2 doubles/bin:
    // gradient + hessian).
    uint64_t max_total_bin = std::max<uint64_t>(row_wise_state->num_hist_total_bin(),
      col_wise_state->num_hist_total_bin());
    std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>>
        hist_data(max_total_bin * 2);

    // NOTE(review): a chrono duration is passed where "%f" is formatted;
    // confirm Log::Debug/Warning accept durations (vs needing .count()).
    Log::Debug(
      "init for col-wise cost %f seconds, init for row-wise cost %f seconds",
      col_wise_init_time * 1e-3, row_wise_init_time * 1e-3);

    col_wise_state->is_col_wise = true;
    col_wise_state->is_constant_hessian = is_constant_hessian;
    InitTrain(is_feature_used, col_wise_state.get());
    row_wise_state->is_col_wise = false;
    row_wise_state->is_constant_hessian = is_constant_hessian;
    InitTrain(is_feature_used, row_wise_state.get());
    // Benchmark: one full histogram construction per mode.
    std::chrono::duration<double, std::milli> col_wise_time, row_wise_time;
    start_time = std::chrono::steady_clock::now();
    ConstructHistograms(is_feature_used, nullptr, num_data_, gradients,
                        hessians, gradients, hessians, col_wise_state.get(),
                        hist_data.data());
    col_wise_time = std::chrono::steady_clock::now() - start_time;
    start_time = std::chrono::steady_clock::now();
    ConstructHistograms(is_feature_used, nullptr, num_data_, gradients,
                        hessians, gradients, hessians, row_wise_state.get(),
                        hist_data.data());
    row_wise_time = std::chrono::steady_clock::now() - start_time;

    if (col_wise_time < row_wise_time) {
      auto overhead_cost = row_wise_init_time + row_wise_time + col_wise_time;
      Log::Warning(
          "Auto-choosing col-wise multi-threading, the overhead of testing was "
          "%f seconds.\n"
          "You can set `force_col_wise=true` to remove the overhead.",
          overhead_cost * 1e-3);
      return col_wise_state.release();
    } else {
      auto overhead_cost = col_wise_init_time + row_wise_time + col_wise_time;
      Log::Warning(
          "Auto-choosing row-wise multi-threading, the overhead of testing was "
          "%f seconds.\n"
          "You can set `force_row_wise=true` to remove the overhead.\n"
          "And if memory is not enough, you can set `force_col_wise=true`.",
          overhead_cost * 1e-3);
      if (row_wise_state->IsSparseRowwise()) {
        Log::Debug("Using Sparse Multi-Val Bin");
      } else {
        Log::Debug("Using Dense Multi-Val Bin");
      }
      return row_wise_state.release();
    }
  }
}

702
void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) {
Guolin Ke's avatar
Guolin Ke committed
703
  feature_groups_.clear();
Guolin Ke's avatar
Guolin Ke committed
704
  num_features_ = dataset->num_features_;
Guolin Ke's avatar
Guolin Ke committed
705
  num_groups_ = dataset->num_groups_;
706
  has_raw_ = dataset->has_raw();
Guolin Ke's avatar
Guolin Ke committed
707
  // copy feature bin mapper data
Guolin Ke's avatar
Guolin Ke committed
708
  for (int i = 0; i < num_groups_; ++i) {
Guolin Ke's avatar
Guolin Ke committed
709
710
    feature_groups_.emplace_back(
        new FeatureGroup(*dataset->feature_groups_[i], num_data_));
Guolin Ke's avatar
Guolin Ke committed
711
  }
Guolin Ke's avatar
Guolin Ke committed
712
  feature_groups_.shrink_to_fit();
Guolin Ke's avatar
Guolin Ke committed
713
714
715
  used_feature_map_ = dataset->used_feature_map_;
  num_total_features_ = dataset->num_total_features_;
  feature_names_ = dataset->feature_names_;
Guolin Ke's avatar
Guolin Ke committed
716
  label_idx_ = dataset->label_idx_;
Guolin Ke's avatar
Guolin Ke committed
717
718
719
720
721
722
  real_feature_idx_ = dataset->real_feature_idx_;
  feature2group_ = dataset->feature2group_;
  feature2subfeature_ = dataset->feature2subfeature_;
  group_bin_boundaries_ = dataset->group_bin_boundaries_;
  group_feature_start_ = dataset->group_feature_start_;
  group_feature_cnt_ = dataset->group_feature_cnt_;
723
  forced_bin_bounds_ = dataset->forced_bin_bounds_;
Guolin Ke's avatar
Guolin Ke committed
724
  feature_need_push_zeros_ = dataset->feature_need_push_zeros_;
Guolin Ke's avatar
Guolin Ke committed
725
726
727
728
729
730
}

// Initializes this Dataset as a validation set that reuses the exact binning
// scheme of `dataset` (the training set). Unlike the training set, every
// feature is placed in its own single-feature group here.
void Dataset::CreateValid(const Dataset* dataset) {
  feature_groups_.clear();
  num_features_ = dataset->num_features_;
  // one group per feature for validation data
  num_groups_ = num_features_;
  // mirror the binning configuration of the training set
  max_bin_ = dataset->max_bin_;
  min_data_in_bin_ = dataset->min_data_in_bin_;
  bin_construct_sample_cnt_ = dataset->bin_construct_sample_cnt_;
  use_missing_ = dataset->use_missing_;
  zero_as_missing_ = dataset->zero_as_missing_;
  feature2group_.clear();
  feature2subfeature_.clear();
  has_raw_ = dataset->has_raw();
  numeric_feature_map_ = dataset->numeric_feature_map_;
  num_numeric_features_ = dataset->num_numeric_features_;
  // copy feature bin mapper data
  feature_need_push_zeros_.clear();
  // group_bin_boundaries_ gets one extra entry (the leading 0) so that
  // boundary i..i+1 brackets group i's bins.
  group_bin_boundaries_.clear();
  uint64_t num_total_bin = 0;
  group_bin_boundaries_.push_back(num_total_bin);
  group_feature_start_.resize(num_groups_);
  group_feature_cnt_.resize(num_groups_);
  for (int i = 0; i < num_features_; ++i) {
    // deep-copy the training set's bin mapper for feature i
    std::vector<std::unique_ptr<BinMapper>> bin_mappers;
    bin_mappers.emplace_back(new BinMapper(*(dataset->FeatureBinMapper(i))));
    // features whose default bin differs from the most frequent bin need
    // explicit zero pushes when populating bins
    if (bin_mappers.back()->GetDefaultBin() !=
        bin_mappers.back()->GetMostFreqBin()) {
      feature_need_push_zeros_.push_back(i);
    }
    feature_groups_.emplace_back(new FeatureGroup(&bin_mappers, num_data_));
    // trivial 1:1 layout: feature i lives alone in group i
    feature2group_.push_back(i);
    feature2subfeature_.push_back(0);
    num_total_bin += feature_groups_[i]->num_total_bin_;
    group_bin_boundaries_.push_back(num_total_bin);
    group_feature_start_[i] = i;
    group_feature_cnt_[i] = 1;
  }
  feature_groups_.shrink_to_fit();
  // mirror remaining layout bookkeeping from the training set
  used_feature_map_ = dataset->used_feature_map_;
  num_total_features_ = dataset->num_total_features_;
  feature_names_ = dataset->feature_names_;
  label_idx_ = dataset->label_idx_;
  real_feature_idx_ = dataset->real_feature_idx_;
  forced_bin_bounds_ = dataset->forced_bin_bounds_;
}

Guolin Ke's avatar
Guolin Ke committed
773
774
775
// Resizes every feature group's storage to hold `num_data` rows.
// No-op when the dataset already has exactly that many rows.
void Dataset::ReSize(data_size_t num_data) {
  if (num_data_ == num_data) {
    return;  // already the requested size
  }
  num_data_ = num_data;
  // Resize the groups in parallel; exceptions from worker threads are
  // captured and rethrown on the calling thread.
  OMP_INIT_EX();
#pragma omp parallel for schedule(static)
  for (int gid = 0; gid < num_groups_; ++gid) {
    OMP_LOOP_EX_BEGIN();
    feature_groups_[gid]->ReSize(num_data_);
    OMP_LOOP_EX_END();
  }
  OMP_THROW_EX();
}

787
// Fills this (already sized) Dataset with the rows of `fullset` selected by
// `used_indices`. `num_used_indices` must equal this Dataset's num_data_.
// Copying is split into independent tasks: one per dense group, and one per
// sub-feature of a multi-value group, which are then run in parallel.
void Dataset::CopySubrow(const Dataset* fullset,
                         const data_size_t* used_indices,
                         data_size_t num_used_indices, bool need_meta_data) {
  CHECK_EQ(num_used_indices, num_data_);

  // Build the task list: (group, subfeature) pairs. subfeature == -1 means
  // "copy the whole group in one task" (non-multi-value groups).
  std::vector<int> group_ids, subfeature_ids;
  group_ids.reserve(num_features_);
  subfeature_ids.reserve(num_features_);
  for (int group = 0; group < num_groups_; ++group) {
    if (fullset->IsMultiGroup(group)) {
      // multi-value groups are copied one sub-feature at a time
      for (int sub_feature = 0; sub_feature <
          fullset->feature_groups_[group]->num_feature_; ++sub_feature) {
        group_ids.emplace_back(group);
        subfeature_ids.emplace_back(sub_feature);
      }
    } else {
      group_ids.emplace_back(group);
      subfeature_ids.emplace_back(-1);
    }
  }
  int num_copy_tasks = static_cast<int>(group_ids.size());

  // dynamic schedule: task costs vary between whole-group and sub-feature copies
  OMP_INIT_EX();
  #pragma omp parallel for schedule(dynamic)
  for (int task_id = 0; task_id < num_copy_tasks; ++task_id) {
    OMP_LOOP_EX_BEGIN();
    int group = group_ids[task_id];
    int subfeature = subfeature_ids[task_id];
    feature_groups_[group]->CopySubrowByCol(fullset->feature_groups_[group].get(),
                                            used_indices, num_used_indices, subfeature);
    OMP_LOOP_EX_END();
  }
  OMP_THROW_EX();

  if (need_meta_data) {
    // subset the labels/weights/queries to the selected rows
    metadata_.Init(fullset->metadata_, used_indices, num_used_indices);
  }
  is_finish_load_ = true;
  numeric_feature_map_ = fullset->numeric_feature_map_;
  num_numeric_features_ = fullset->num_numeric_features_;
  if (has_raw_) {
    // copy the selected rows of the raw (unbinned) feature values as well
    ResizeRaw(num_used_indices);
#pragma omp parallel for schedule(static)
    for (int i = 0; i < num_used_indices; ++i) {
      for (int j = 0; j < num_numeric_features_; ++j) {
        raw_data_[j][i] = fullset->raw_data_[j][used_indices[i]];
      }
    }
  }
}

Guolin Ke's avatar
Guolin Ke committed
838
839
bool Dataset::SetFloatField(const char* field_name, const float* field_data,
                            data_size_t num_element) {
Guolin Ke's avatar
Guolin Ke committed
840
841
842
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("label") || name == std::string("target")) {
Guolin Ke's avatar
Guolin Ke committed
843
#ifdef LABEL_T_USE_DOUBLE
844
    Log::Fatal("Don't support LABEL_T_USE_DOUBLE");
Guolin Ke's avatar
Guolin Ke committed
845
#else
846
    metadata_.SetLabel(field_data, num_element);
Guolin Ke's avatar
Guolin Ke committed
847
#endif
Guolin Ke's avatar
Guolin Ke committed
848
  } else if (name == std::string("weight") || name == std::string("weights")) {
Guolin Ke's avatar
Guolin Ke committed
849
#ifdef LABEL_T_USE_DOUBLE
850
    Log::Fatal("Don't support LABEL_T_USE_DOUBLE");
Guolin Ke's avatar
Guolin Ke committed
851
#else
852
    metadata_.SetWeights(field_data, num_element);
Guolin Ke's avatar
Guolin Ke committed
853
#endif
Guolin Ke's avatar
Guolin Ke committed
854
855
856
857
858
859
  } else {
    return false;
  }
  return true;
}

Guolin Ke's avatar
Guolin Ke committed
860
861
bool Dataset::SetDoubleField(const char* field_name, const double* field_data,
                             data_size_t num_element) {
Guolin Ke's avatar
Guolin Ke committed
862
863
864
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("init_score")) {
865
    metadata_.SetInitScore(field_data, num_element);
Guolin Ke's avatar
Guolin Ke committed
866
  } else {
867
    return false;
Guolin Ke's avatar
Guolin Ke committed
868
  }
869
  return true;
Guolin Ke's avatar
Guolin Ke committed
870
871
}

Guolin Ke's avatar
Guolin Ke committed
872
873
bool Dataset::SetIntField(const char* field_name, const int* field_data,
                          data_size_t num_element) {
874
875
876
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("query") || name == std::string("group")) {
Guolin Ke's avatar
Guolin Ke committed
877
    metadata_.SetQuery(field_data, num_element);
878
879
880
881
882
883
  } else {
    return false;
  }
  return true;
}

Guolin Ke's avatar
Guolin Ke committed
884
885
bool Dataset::GetFloatField(const char* field_name, data_size_t* out_len,
                            const float** out_ptr) {
886
887
888
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("label") || name == std::string("target")) {
Guolin Ke's avatar
Guolin Ke committed
889
#ifdef LABEL_T_USE_DOUBLE
890
    Log::Fatal("Don't support LABEL_T_USE_DOUBLE");
Guolin Ke's avatar
Guolin Ke committed
891
#else
892
893
    *out_ptr = metadata_.label();
    *out_len = num_data_;
Guolin Ke's avatar
Guolin Ke committed
894
#endif
895
  } else if (name == std::string("weight") || name == std::string("weights")) {
Guolin Ke's avatar
Guolin Ke committed
896
#ifdef LABEL_T_USE_DOUBLE
897
    Log::Fatal("Don't support LABEL_T_USE_DOUBLE");
Guolin Ke's avatar
Guolin Ke committed
898
#else
899
900
    *out_ptr = metadata_.weights();
    *out_len = num_data_;
Guolin Ke's avatar
Guolin Ke committed
901
#endif
Guolin Ke's avatar
Guolin Ke committed
902
903
904
905
906
907
  } else {
    return false;
  }
  return true;
}

Guolin Ke's avatar
Guolin Ke committed
908
909
bool Dataset::GetDoubleField(const char* field_name, data_size_t* out_len,
                             const double** out_ptr) {
Guolin Ke's avatar
Guolin Ke committed
910
911
912
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("init_score")) {
913
    *out_ptr = metadata_.init_score();
Guolin Ke's avatar
Guolin Ke committed
914
    *out_len = static_cast<data_size_t>(metadata_.num_init_score());
915
  } else {
916
917
    return false;
  }
918
  return true;
919
920
}

Guolin Ke's avatar
Guolin Ke committed
921
922
// Exposes int metadata by name ("query"/"group"). On success returns the
// query boundary array, which has num_queries() + 1 entries.
bool Dataset::GetIntField(const char* field_name, data_size_t* out_len,
                          const int** out_ptr) {
  const std::string name = Common::Trim(std::string(field_name));
  if (name != "query" && name != "group") {
    return false;  // unrecognized field
  }
  *out_ptr = metadata_.query_boundaries();
  // boundaries array is fence-post style: one more entry than queries
  *out_len = metadata_.num_queries() + 1;
  return true;
}

Guolin Ke's avatar
Guolin Ke committed
934
// Serializes the full dataset to a binary file so later runs can skip
// parsing/binning. Layout (in write order): file token, header size, header
// (counts + layout arrays + feature names + forced bin bounds), metadata
// size + metadata, per-group feature data, and optionally the raw feature
// matrix. Refuses to overwrite the source data file or an existing file.
void Dataset::SaveBinaryFile(const char* bin_filename) {
  if (bin_filename != nullptr && std::string(bin_filename) == data_filename_) {
    // target is the file this dataset was loaded from — never clobber it
    Log::Warning("Binary file %s already exists", bin_filename);
    return;
  }
  // if not pass a filename, just append ".bin" of original file
  std::string bin_filename_str(data_filename_);
  if (bin_filename == nullptr || bin_filename[0] == '\0') {
    bin_filename_str.append(".bin");
    bin_filename = bin_filename_str.c_str();
  }
  bool is_file_existed = false;

  if (VirtualFileWriter::Exists(bin_filename)) {
    is_file_existed = true;
    Log::Warning("File %s exists, cannot save binary to it", bin_filename);
  }

  if (!is_file_existed) {
    auto writer = VirtualFileWriter::Make(bin_filename);
    if (!writer->Init()) {
      Log::Fatal("Cannot write binary data to %s ", bin_filename);
    }
    Log::Info("Saving data to binary file %s", bin_filename);
    size_t size_of_token = std::strlen(binary_file_token);
    writer->AlignedWrite(binary_file_token, size_of_token);
    // get size of header; must mirror the AlignedWrite calls below exactly
    size_t size_of_header =
        VirtualFileWriter::AlignedSize(sizeof(num_data_)) +
        VirtualFileWriter::AlignedSize(sizeof(num_features_)) +
        VirtualFileWriter::AlignedSize(sizeof(num_total_features_)) +
        VirtualFileWriter::AlignedSize(sizeof(int) * num_total_features_) +
        VirtualFileWriter::AlignedSize(sizeof(label_idx_)) +
        VirtualFileWriter::AlignedSize(sizeof(num_groups_)) +
        3 * VirtualFileWriter::AlignedSize(sizeof(int) * num_features_) +
        sizeof(uint64_t) * (num_groups_ + 1) +
        2 * VirtualFileWriter::AlignedSize(sizeof(int) * num_groups_) +
        VirtualFileWriter::AlignedSize(sizeof(int32_t) * num_total_features_) +
        VirtualFileWriter::AlignedSize(sizeof(int)) * 3 +
        VirtualFileWriter::AlignedSize(sizeof(bool)) * 3;
    // size of feature names (length prefix + characters, each aligned)
    for (int i = 0; i < num_total_features_; ++i) {
      size_of_header +=
          VirtualFileWriter::AlignedSize(feature_names_[i].size()) +
          VirtualFileWriter::AlignedSize(sizeof(int));
    }
    // size of forced bins (count prefix + raw doubles)
    for (int i = 0; i < num_total_features_; ++i) {
      size_of_header += forced_bin_bounds_[i].size() * sizeof(double) +
                        VirtualFileWriter::AlignedSize(sizeof(int));
    }
    writer->Write(&size_of_header, sizeof(size_of_header));
    // write header
    writer->AlignedWrite(&num_data_, sizeof(num_data_));
    writer->AlignedWrite(&num_features_, sizeof(num_features_));
    writer->AlignedWrite(&num_total_features_, sizeof(num_total_features_));
    writer->AlignedWrite(&label_idx_, sizeof(label_idx_));
    writer->AlignedWrite(&max_bin_, sizeof(max_bin_));
    writer->AlignedWrite(&bin_construct_sample_cnt_,
                         sizeof(bin_construct_sample_cnt_));
    writer->AlignedWrite(&min_data_in_bin_, sizeof(min_data_in_bin_));
    writer->AlignedWrite(&use_missing_, sizeof(use_missing_));
    writer->AlignedWrite(&zero_as_missing_, sizeof(zero_as_missing_));
    writer->AlignedWrite(&has_raw_, sizeof(has_raw_));
    writer->AlignedWrite(used_feature_map_.data(),
                         sizeof(int) * num_total_features_);
    writer->AlignedWrite(&num_groups_, sizeof(num_groups_));
    writer->AlignedWrite(real_feature_idx_.data(), sizeof(int) * num_features_);
    writer->AlignedWrite(feature2group_.data(), sizeof(int) * num_features_);
    writer->AlignedWrite(feature2subfeature_.data(),
                         sizeof(int) * num_features_);
    writer->Write(group_bin_boundaries_.data(),
                  sizeof(uint64_t) * (num_groups_ + 1));
    writer->AlignedWrite(group_feature_start_.data(),
                         sizeof(int) * num_groups_);
    writer->AlignedWrite(group_feature_cnt_.data(), sizeof(int) * num_groups_);
    // temporarily materialize the -1 sentinels so a fixed-size array can be
    // written, then restore the compact empty representation afterwards
    if (max_bin_by_feature_.empty()) {
      ArrayArgs<int32_t>::Assign(&max_bin_by_feature_, -1, num_total_features_);
    }
    writer->AlignedWrite(max_bin_by_feature_.data(),
                  sizeof(int32_t) * num_total_features_);
    if (ArrayArgs<int32_t>::CheckAll(max_bin_by_feature_, -1)) {
      max_bin_by_feature_.clear();
    }
    // write feature names (length-prefixed strings)
    for (int i = 0; i < num_total_features_; ++i) {
      int str_len = static_cast<int>(feature_names_[i].size());
      writer->AlignedWrite(&str_len, sizeof(int));
      const char* c_str = feature_names_[i].c_str();
      writer->AlignedWrite(c_str, sizeof(char) * str_len);
    }
    // write forced bins (count-prefixed double arrays)
    for (int i = 0; i < num_total_features_; ++i) {
      int num_bounds = static_cast<int>(forced_bin_bounds_[i].size());
      writer->AlignedWrite(&num_bounds, sizeof(int));

      for (size_t j = 0; j < forced_bin_bounds_[i].size(); ++j) {
        writer->Write(&forced_bin_bounds_[i][j], sizeof(double));
      }
    }

    // get size of meta data
    size_t size_of_metadata = metadata_.SizesInByte();
    writer->Write(&size_of_metadata, sizeof(size_of_metadata));
    // write meta data
    metadata_.SaveBinaryToFile(writer.get());

    // write feature data, each group prefixed by its byte size
    for (int i = 0; i < num_groups_; ++i) {
      // get size of feature
      size_t size_of_feature = feature_groups_[i]->SizesInByte();
      writer->Write(&size_of_feature, sizeof(size_of_feature));
      // write feature
      feature_groups_[i]->SaveBinaryToFile(writer.get());
    }

    // write raw data; use row-major order so we can read row-by-row
    if (has_raw_) {
      for (int i = 0; i < num_data_; ++i) {
        for (int j = 0; j < num_features_; ++j) {
          int feat_ind = numeric_feature_map_[j];
          if (feat_ind > -1) {
            // only numeric features carry raw values
            writer->Write(&raw_data_[feat_ind][i], sizeof(float));
          }
        }
      }
    }
  }
}

1064
void Dataset::DumpTextFile(const char* text_filename) {
Guolin Ke's avatar
Guolin Ke committed
1065
1066
1067
1068
1069
1070
  FILE* file = NULL;
#if _MSC_VER
  fopen_s(&file, text_filename, "wt");
#else
  file = fopen(text_filename, "wt");
#endif
1071
1072
1073
1074
1075
  fprintf(file, "num_features: %d\n", num_features_);
  fprintf(file, "num_total_features: %d\n", num_total_features_);
  fprintf(file, "num_groups: %d\n", num_groups_);
  fprintf(file, "num_data: %d\n", num_data_);
  fprintf(file, "feature_names: ");
1076
  for (auto n : feature_names_) {
1077
1078
    fprintf(file, "%s, ", n.c_str());
  }
Belinda Trotta's avatar
Belinda Trotta committed
1079
1080
1081
1082
  fprintf(file, "\nmax_bin_by_feature: ");
  for (auto i : max_bin_by_feature_) {
    fprintf(file, "%d, ", i);
  }
1083
  fprintf(file, "\n");
1084
  for (auto n : feature_names_) {
1085
1086
    fprintf(file, "%s, ", n.c_str());
  }
1087
1088
1089
1090
1091
1092
1093
  fprintf(file, "\nforced_bins: ");
  for (int i = 0; i < num_total_features_; ++i) {
    fprintf(file, "\nfeature %d: ", i);
    for (size_t j = 0; j < forced_bin_bounds_[i].size(); ++j) {
      fprintf(file, "%lf, ", forced_bin_bounds_[i][j]);
    }
  }
1094
1095
  std::vector<std::unique_ptr<BinIterator>> iterators;
  iterators.reserve(num_features_);
1096
  for (int j = 0; j < num_features_; ++j) {
1097
1098
    auto group_idx = feature2group_[j];
    auto sub_idx = feature2subfeature_[j];
Guolin Ke's avatar
Guolin Ke committed
1099
1100
    iterators.emplace_back(
        feature_groups_[group_idx]->SubFeatureIterator(sub_idx));
1101
  }
1102
  for (data_size_t i = 0; i < num_data_; ++i) {
1103
    fprintf(file, "\n");
1104
    for (int j = 0; j < num_total_features_; ++j) {
1105
      auto inner_feature_idx = used_feature_map_[j];
1106
1107
      if (inner_feature_idx < 0) {
        fprintf(file, "NA, ");
1108
      } else {
Guolin Ke's avatar
Guolin Ke committed
1109
        fprintf(file, "%d, ", iterators[inner_feature_idx]->Get(i));
1110
1111
1112
1113
1114
1115
      }
    }
  }
  fclose(file);
}

1116
void Dataset::InitTrain(const std::vector<int8_t>& is_feature_used,
1117
                        TrainingShareStates* share_state) const {
1118
  Common::FunctionTimer fun_time("Dataset::InitTrain", global_timer);
1119
1120
1121
  share_state->InitTrain(group_feature_start_,
        feature_groups_,
        is_feature_used);
1122
1123
}

Guolin Ke's avatar
Guolin Ke committed
1124
template <bool USE_INDICES, bool ORDERED>
1125
1126
1127
1128
1129
1130
void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices,
                                          data_size_t num_data,
                                          const score_t* gradients,
                                          const score_t* hessians,
                                          TrainingShareStates* share_state,
                                          hist_t* hist_data) const {
1131
1132
  Common::FunctionTimer fun_time("Dataset::ConstructHistogramsMultiVal",
                                 global_timer);
1133
1134
  share_state->ConstructHistograms<USE_INDICES, ORDERED>(
      data_indices, num_data, gradients, hessians, hist_data);
1135
1136
}

Guolin Ke's avatar
Guolin Ke committed
1137
1138
// Core histogram construction. Routes to the row-wise multi-val path when the
// share state is not col-wise; otherwise builds dense-group histograms in
// parallel and finally handles the (at most one) multi-value group.
// USE_INDICES: rows are selected through data_indices. USE_HESSIAN: per-row
// hessians are used; otherwise hessian sums are derived from bin counts
// scaled by hessians[0] (constant-hessian case).
template <bool USE_INDICES, bool USE_HESSIAN>
void Dataset::ConstructHistogramsInner(
    const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
    data_size_t num_data, const score_t* gradients, const score_t* hessians,
    score_t* ordered_gradients, score_t* ordered_hessians,
    TrainingShareStates* share_state, hist_t* hist_data) const {
  if (!share_state->is_col_wise) {
    // row-wise training: everything goes through the multi-val path
    return ConstructHistogramsMultiVal<USE_INDICES, false>(
        data_indices, num_data, gradients, hessians, share_state, hist_data);
  }
  // Partition used groups into dense groups and the single multi-val group.
  // NOTE(review): "groud" looks like a typo for "group" — consider renaming
  // this local in a separate cleanup.
  std::vector<int> used_dense_group;
  int multi_val_groud_id = -1;
  used_dense_group.reserve(num_groups_);
  for (int group = 0; group < num_groups_; ++group) {
    const int f_start = group_feature_start_[group];
    const int f_cnt = group_feature_cnt_[group];
    bool is_group_used = false;
    // a group is used if any of its features is used
    for (int j = 0; j < f_cnt; ++j) {
      const int fidx = f_start + j;
      if (is_feature_used[fidx]) {
        is_group_used = true;
        break;
      }
    }
    if (is_group_used) {
      if (feature_groups_[group]->is_multi_val_) {
        multi_val_groud_id = group;
      } else {
        used_dense_group.push_back(group);
      }
    }
  }
  int num_used_dense_group = static_cast<int>(used_dense_group.size());
  global_timer.Start("Dataset::dense_bin_histogram");
  auto ptr_ordered_grad = gradients;
  auto ptr_ordered_hess = hessians;
  if (num_used_dense_group > 0) {
    if (USE_INDICES) {
      // gather gradients (and hessians) into index order once, so each group
      // can then scan them sequentially
      if (USE_HESSIAN) {
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024)
        for (data_size_t i = 0; i < num_data; ++i) {
          ordered_gradients[i] = gradients[data_indices[i]];
          ordered_hessians[i] = hessians[data_indices[i]];
        }
        ptr_ordered_grad = ordered_gradients;
        ptr_ordered_hess = ordered_hessians;
      } else {
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024)
        for (data_size_t i = 0; i < num_data; ++i) {
          ordered_gradients[i] = gradients[data_indices[i]];
        }
        ptr_ordered_grad = ordered_gradients;
      }
    }
    // one parallel task per used dense group
    OMP_INIT_EX();
#pragma omp parallel for schedule(static) num_threads(share_state->num_threads)
    for (int gi = 0; gi < num_used_dense_group; ++gi) {
      OMP_LOOP_EX_BEGIN();
      int group = used_dense_group[gi];
      // histogram entries are (grad, hess) pairs, hence the * 2 offset
      auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
      const int num_bin = feature_groups_[group]->num_total_bin_;
      std::memset(reinterpret_cast<void*>(data_ptr), 0,
                  num_bin * kHistEntrySize);
      if (USE_HESSIAN) {
        if (USE_INDICES) {
          feature_groups_[group]->bin_data_->ConstructHistogram(
              data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess,
              data_ptr);
        } else {
          feature_groups_[group]->bin_data_->ConstructHistogram(
              0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr);
        }
      } else {
        if (USE_INDICES) {
          feature_groups_[group]->bin_data_->ConstructHistogram(
              data_indices, 0, num_data, ptr_ordered_grad, data_ptr);
        } else {
          feature_groups_[group]->bin_data_->ConstructHistogram(
              0, num_data, ptr_ordered_grad, data_ptr);
        }
        // constant-hessian case: the hessian slot currently holds a count;
        // convert it to a hessian sum by scaling with the shared hessians[0]
        auto cnt_dst = reinterpret_cast<hist_cnt_t*>(data_ptr + 1);
        for (int i = 0; i < num_bin * 2; i += 2) {
          data_ptr[i + 1] = static_cast<double>(cnt_dst[i]) * hessians[0];
        }
      }
      OMP_LOOP_EX_END();
    }
    OMP_THROW_EX();
  }
  global_timer.Stop("Dataset::dense_bin_histogram");
  if (multi_val_groud_id >= 0) {
    if (num_used_dense_group > 0) {
      // gradients were already reordered above, so pass ORDERED = true
      ConstructHistogramsMultiVal<USE_INDICES, true>(
          data_indices, num_data, ptr_ordered_grad, ptr_ordered_hess,
          share_state,
          hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
    } else {
      ConstructHistogramsMultiVal<USE_INDICES, false>(
          data_indices, num_data, gradients, hessians, share_state,
          hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
    }
  }
}

James Lamb's avatar
James Lamb committed
1241
// explicitly initialize template methods, for cross module call
// (all four <USE_INDICES, USE_HESSIAN> combinations must be emitted here so
// other translation units can link against them)
template void Dataset::ConstructHistogramsInner<true, true>(
    const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
    data_size_t num_data, const score_t* gradients, const score_t* hessians,
    score_t* ordered_gradients, score_t* ordered_hessians,
    TrainingShareStates* share_state, hist_t* hist_data) const;

template void Dataset::ConstructHistogramsInner<true, false>(
    const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
    data_size_t num_data, const score_t* gradients, const score_t* hessians,
    score_t* ordered_gradients, score_t* ordered_hessians,
    TrainingShareStates* share_state, hist_t* hist_data) const;

template void Dataset::ConstructHistogramsInner<false, true>(
    const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
    data_size_t num_data, const score_t* gradients, const score_t* hessians,
    score_t* ordered_gradients, score_t* ordered_hessians,
    TrainingShareStates* share_state, hist_t* hist_data) const;

template void Dataset::ConstructHistogramsInner<false, false>(
    const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
    data_size_t num_data, const score_t* gradients, const score_t* hessians,
    score_t* ordered_gradients, score_t* ordered_hessians,
    TrainingShareStates* share_state, hist_t* hist_data) const;

Guolin Ke's avatar
Guolin Ke committed
1266
1267
void Dataset::FixHistogram(int feature_idx, double sum_gradient,
                           double sum_hessian, hist_t* data) const {
Guolin Ke's avatar
Guolin Ke committed
1268
1269
  const int group = feature2group_[feature_idx];
  const int sub_feature = feature2subfeature_[feature_idx];
Guolin Ke's avatar
Guolin Ke committed
1270
1271
  const BinMapper* bin_mapper =
      feature_groups_[group]->bin_mappers_[sub_feature].get();
Guolin Ke's avatar
Guolin Ke committed
1272
1273
  const int most_freq_bin = bin_mapper->GetMostFreqBin();
  if (most_freq_bin > 0) {
Guolin Ke's avatar
Guolin Ke committed
1274
    const int num_bin = bin_mapper->num_bin();
1275
1276
    GET_GRAD(data, most_freq_bin) = sum_gradient;
    GET_HESS(data, most_freq_bin) = sum_hessian;
Guolin Ke's avatar
Guolin Ke committed
1277
    for (int i = 0; i < num_bin; ++i) {
Guolin Ke's avatar
Guolin Ke committed
1278
      if (i != most_freq_bin) {
1279
1280
        GET_GRAD(data, most_freq_bin) -= GET_GRAD(data, i);
        GET_HESS(data, most_freq_bin) -= GET_HESS(data, i);
Guolin Ke's avatar
Guolin Ke committed
1281
1282
1283
1284
1285
      }
    }
  }
}

Guolin Ke's avatar
Guolin Ke committed
1286
/*!
 * Appends all elements of `src` to the end of `*dest`.
 * Uses vector::insert instead of a manual reserve + push_back loop: a single
 * range insert lets the vector grow once, and is also well-defined when
 * dest == &src (self-append), where a manual reserve could invalidate the
 * source iterators mid-loop.
 */
template <typename T>
void PushVector(std::vector<T>* dest, const std::vector<T>& src) {
  dest->insert(dest->end(), src.begin(), src.end());
}

Guolin Ke's avatar
Guolin Ke committed
1294
1295
1296
/*!
 * Appends every element of `src`, shifted by `offset`, to the end of
 * `*dest`. Capacity is reserved up front so the loop performs at most one
 * reallocation.
 */
template <typename T>
void PushOffset(std::vector<T>* dest, const std::vector<T>& src,
                const T& offset) {
  dest->reserve(dest->size() + src.size());
  for (const T& val : src) {
    dest->push_back(val + offset);
  }
}

Guolin Ke's avatar
Guolin Ke committed
1303
1304
1305
1306
/*!
 * Merges `src` into `*dest`, padding whichever side is empty with `deflt`:
 *  - both non-empty: plain append of src onto dest;
 *  - src empty, dest non-empty: append src_len default values;
 *  - dest empty, src non-empty: prepend dest_len default values, then src;
 *  - both empty: dest is left untouched.
 */
template <typename T>
void PushClearIfEmpty(std::vector<T>* dest, const size_t dest_len,
                      const std::vector<T>& src, const size_t src_len,
                      const T& deflt) {
  const bool has_dest = !dest->empty();
  const bool has_src = !src.empty();
  if (has_dest && has_src) {
    dest->insert(dest->end(), src.begin(), src.end());
  } else if (has_dest && !has_src) {
    // src carries no values: stand in src_len defaults
    dest->insert(dest->end(), src_len, deflt);
  } else if (!has_dest && has_src) {
    // dest carries no values: stand in dest_len defaults, then append src
    dest->insert(dest->end(), dest_len, deflt);
    dest->insert(dest->end(), src.begin(), src.end());
  }
}

1321
/*!
 * \brief Append all features of another Dataset to this one (column-wise merge).
 *
 * Both Datasets must have the same number of rows and must agree on whether
 * raw data is stored.  Feature groups, the per-feature index maps
 * (used_feature_map_, real_feature_idx_, feature2group_, feature2subfeature_),
 * group bin boundaries, feature names (de-duplicated), forced bin bounds,
 * max-bin settings, the numeric-feature map and (when present) raw data
 * columns are all carried over; indices taken from `other` are shifted by
 * this Dataset's current counts.
 *
 * Fix: the numeric-feature map merge previously read this Dataset's own
 * numeric_feature_map_ (while appending to it) instead of `other`'s, which
 * produced a corrupted map; it now reads other->numeric_feature_map_.
 *
 * \param other Dataset whose features are appended; only read, not owned.
 */
void Dataset::AddFeaturesFrom(Dataset* other) {
  if (other->num_data_ != num_data_) {
    Log::Fatal(
        "Cannot add features from other Dataset with a different number of "
        "rows");
  }
  if (other->has_raw_ != has_raw_) {
    Log::Fatal("Can only add features from other Dataset if both or neither have raw data.");
  }
  // Locate the multi-value feature group (if any) on each side.
  int mv_gid = -1;
  int other_mv_gid = -1;
  for (int i = 0; i < num_groups_; ++i) {
    if (IsMultiGroup(i)) {
      mv_gid = i;
    }
  }
  for (int i = 0; i < other->num_groups_; ++i) {
    if (other->IsMultiGroup(i)) {
      other_mv_gid = i;
    }
  }
  // Only one multi-val group, just simply merge
  if (mv_gid < 0 || other_mv_gid < 0) {
    // Groups can be concatenated directly; index-based fields from `other`
    // are shifted by this side's current feature/group/bin counts.
    PushVector(&feature2subfeature_, other->feature2subfeature_);
    PushVector(&group_feature_cnt_, other->group_feature_cnt_);
    feature_groups_.reserve(feature_groups_.size() +
                            other->feature_groups_.size());
    for (auto& fg : other->feature_groups_) {
      const int cur_group_id = static_cast<int>(feature_groups_.size());
      feature_groups_.emplace_back(new FeatureGroup(*fg, true, cur_group_id));
    }
    for (auto feature_idx : other->used_feature_map_) {
      if (feature_idx >= 0) {
        used_feature_map_.push_back(feature_idx + num_features_);
      } else {
        used_feature_map_.push_back(-1);  // Unused feature.
      }
    }
    PushOffset(&real_feature_idx_, other->real_feature_idx_,
               num_total_features_);
    PushOffset(&feature2group_, other->feature2group_, num_groups_);
    auto bin_offset = group_bin_boundaries_.back();
    // Skip the leading 0 when copying group_bin_boundaries.
    for (auto i = other->group_bin_boundaries_.begin() + 1;
         i < other->group_bin_boundaries_.end(); ++i) {
      group_bin_boundaries_.push_back(*i + bin_offset);
    }
    PushOffset(&group_feature_start_, other->group_feature_start_,
               num_features_);
    num_groups_ += other->num_groups_;
    num_features_ += other->num_features_;
  } else {
    // Both sides have a multi-value group: merge the two multi-value groups
    // into one and rebuild the per-feature index maps from scratch.
    // First record the real feature indices currently held by each group.
    std::vector<std::vector<int>> features_in_group;
    for (int i = 0; i < num_groups_; ++i) {
      int f_start = group_feature_start_[i];
      int f_cnt = group_feature_cnt_[i];
      features_in_group.emplace_back();
      for (int j = 0; j < f_cnt; ++j) {
        const int real_fidx = real_feature_idx_[f_start + j];
        features_in_group.back().push_back(real_fidx);
      }
    }
    feature_groups_[mv_gid]->AddFeaturesFrom(
        other->feature_groups_[other_mv_gid].get(), mv_gid);
    for (int i = 0; i < other->num_groups_; ++i) {
      int f_start = other->group_feature_start_[i];
      int f_cnt = other->group_feature_cnt_[i];
      if (i == other_mv_gid) {
        // Features of other's multi-value group land in this side's one.
        for (int j = 0; j < f_cnt; ++j) {
          const int real_fidx = other->real_feature_idx_[f_start + j] + num_total_features_;
          features_in_group[mv_gid].push_back(real_fidx);
        }
      } else {
        features_in_group.emplace_back();
        for (int j = 0; j < f_cnt; ++j) {
          const int real_fidx = other->real_feature_idx_[f_start + j] + num_total_features_;
          features_in_group.back().push_back(real_fidx);
        }
        feature_groups_.emplace_back(
            new FeatureGroup(*other->feature_groups_[i], false, -1));
      }
    }
    // regenerate other fields
    num_groups_ += other->num_groups_ - 1;
    CHECK(num_groups_ == static_cast<int>(features_in_group.size()));
    num_features_ += other->num_features_;
    int cur_fidx = 0;
    used_feature_map_ =
      std::vector<int>(num_total_features_ + other->num_total_features_, -1);
    real_feature_idx_.resize(num_features_);
    feature2group_.resize(num_features_);
    feature2subfeature_.resize(num_features_);
    group_feature_start_.resize(num_groups_);
    group_feature_cnt_.resize(num_groups_);

    group_bin_boundaries_.clear();
    uint64_t num_total_bin = 0;
    group_bin_boundaries_.push_back(num_total_bin);
    for (int i = 0; i < num_groups_; ++i) {
      // Reference, not copy: these per-group vectors can be large.
      const auto& cur_features = features_in_group[i];
      int cur_cnt_features = static_cast<int>(cur_features.size());
      group_feature_start_[i] = cur_fidx;
      group_feature_cnt_[i] = cur_cnt_features;
      for (int j = 0; j < cur_cnt_features; ++j) {
        int real_fidx = cur_features[j];
        used_feature_map_[real_fidx] = cur_fidx;
        real_feature_idx_[cur_fidx] = real_fidx;
        feature2group_[cur_fidx] = i;
        feature2subfeature_[cur_fidx] = j;
        ++cur_fidx;
      }
      num_total_bin += feature_groups_[i]->num_total_bin_;
      group_bin_boundaries_.push_back(num_total_bin);
    }
  }
  // Merge feature names, renaming other's duplicates ("D2_name", "D3_name", ...).
  std::unordered_set<std::string> feature_names_set;
  for (const auto& val : feature_names_) {
    feature_names_set.emplace(val);
  }
  for (const auto& val : other->feature_names_) {
    std::string new_name = val;
    int cnt = 2;
    while (feature_names_set.count(new_name)) {
      new_name = "D" + std::to_string(cnt) + "_" + val;
      ++cnt;
    }
    if (new_name != val) {
      Log::Warning(
        "Find the same feature name (%s) in Dataset::AddFeaturesFrom, change "
        "its name to (%s)",
        val.c_str(), new_name.c_str());
    }
    feature_names_set.emplace(new_name);
    feature_names_.push_back(new_name);
  }
  PushVector(&forced_bin_bounds_, other->forced_bin_bounds_);
  PushClearIfEmpty(&max_bin_by_feature_, num_total_features_,
                   other->max_bin_by_feature_, other->num_total_features_, -1);
  num_total_features_ += other->num_total_features_;
  // Append other's numeric-feature map, shifting valid entries by this
  // side's numeric-feature count.  Must read from *other*'s map: reading
  // this Dataset's own map here (the previous code) walked a vector that is
  // being appended to and used the wrong source entries.
  for (size_t i = 0; i < (other->numeric_feature_map_).size(); ++i) {
    const int feat_ind = other->numeric_feature_map_[i];
    if (feat_ind > -1) {
      numeric_feature_map_.push_back(feat_ind + num_numeric_features_);
    } else {
      numeric_feature_map_.push_back(-1);
    }
  }
  num_numeric_features_ += other->num_numeric_features_;
  if (has_raw_) {
    // Raw data is stored per numeric feature; append other's columns.
    for (int i = 0; i < other->num_numeric_features_; ++i) {
      raw_data_.push_back(other->raw_data_[i]);
    }
  }
}

Guolin Ke's avatar
Guolin Ke committed
1475
}  // namespace LightGBM