dataset.cpp 54.9 KB
Newer Older
1
2
/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
Guolin Ke's avatar
Guolin Ke committed
3
4
 * Licensed under the MIT License. See LICENSE file in the project root for
 * license information.
5
 */
6
7
8
#include <LightGBM/dataset.h>

#include <LightGBM/feature_group.h>
9
#include <LightGBM/cuda/vector_cudahost.h>
10
11
12
13
#include <LightGBM/utils/array_args.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/utils/threading.h>

14
15
16
17
18
19
#include <chrono>
#include <cstdio>
#include <limits>
#include <sstream>
#include <unordered_map>

Guolin Ke's avatar
Guolin Ke committed
20
21
namespace LightGBM {

Guolin Ke's avatar
Guolin Ke committed
22
23
const char* Dataset::binary_file_token =
    "______LightGBM_Binary_File_Token______\n";
Guolin Ke's avatar
Guolin Ke committed
24

Guolin Ke's avatar
Guolin Ke committed
25
Dataset::Dataset() {
26
  data_filename_ = "noname";
Guolin Ke's avatar
Guolin Ke committed
27
  num_data_ = 0;
Guolin Ke's avatar
Guolin Ke committed
28
  is_finish_load_ = false;
29
  has_raw_ = false;
Guolin Ke's avatar
Guolin Ke committed
30
31
}

32
Dataset::Dataset(data_size_t num_data) {
33
  CHECK_GT(num_data, 0);
Guolin Ke's avatar
Guolin Ke committed
34
  data_filename_ = "noname";
Guolin Ke's avatar
Guolin Ke committed
35
  num_data_ = num_data;
Guolin Ke's avatar
Guolin Ke committed
36
  metadata_.Init(num_data_, NO_SPECIFIC, NO_SPECIFIC);
Guolin Ke's avatar
Guolin Ke committed
37
  is_finish_load_ = false;
Guolin Ke's avatar
Guolin Ke committed
38
  group_bin_boundaries_.push_back(0);
39
  has_raw_ = false;
Guolin Ke's avatar
Guolin Ke committed
40
41
}

Guolin Ke's avatar
Guolin Ke committed
42
Dataset::~Dataset() {}
Guolin Ke's avatar
Guolin Ke committed
43

Guolin Ke's avatar
Guolin Ke committed
44
std::vector<std::vector<int>> NoGroup(const std::vector<int>& used_features) {
Guolin Ke's avatar
Guolin Ke committed
45
46
47
48
49
50
51
52
  std::vector<std::vector<int>> features_in_group;
  features_in_group.resize(used_features.size());
  for (size_t i = 0; i < used_features.size(); ++i) {
    features_in_group[i].emplace_back(used_features[i]);
  }
  return features_in_group;
}

guanqun's avatar
guanqun committed
53
int GetConflictCount(const std::vector<bool>& mark, const int* indices,
Guolin Ke's avatar
Guolin Ke committed
54
                     int num_indices, data_size_t max_cnt) {
Guolin Ke's avatar
Guolin Ke committed
55
56
57
58
  int ret = 0;
  for (int i = 0; i < num_indices; ++i) {
    if (mark[indices[i]]) {
      ++ret;
59
60
61
    }
    if (ret > max_cnt) {
      return -1;
Guolin Ke's avatar
Guolin Ke committed
62
63
64
65
    }
  }
  return ret;
}
66

Guolin Ke's avatar
Guolin Ke committed
67
68
void MarkUsed(std::vector<bool>* mark, const int* indices,
              data_size_t num_indices) {
Guolin Ke's avatar
Guolin Ke committed
69
  auto& ref_mark = *mark;
Guolin Ke's avatar
Guolin Ke committed
70
  for (int i = 0; i < num_indices; ++i) {
Guolin Ke's avatar
Guolin Ke committed
71
    ref_mark[indices[i]] = true;
Guolin Ke's avatar
Guolin Ke committed
72
73
74
  }
}

Guolin Ke's avatar
Guolin Ke committed
75
76
77
78
std::vector<int> FixSampleIndices(const BinMapper* bin_mapper,
                                  int num_total_samples, int num_indices,
                                  const int* sample_indices,
                                  const double* sample_values) {
Guolin Ke's avatar
Guolin Ke committed
79
80
81
82
83
84
85
86
87
  std::vector<int> ret;
  if (bin_mapper->GetDefaultBin() == bin_mapper->GetMostFreqBin()) {
    return ret;
  }
  int i = 0, j = 0;
  while (i < num_total_samples) {
    if (j < num_indices && sample_indices[j] < i) {
      ++j;
    } else if (j < num_indices && sample_indices[j] == i) {
Guolin Ke's avatar
Guolin Ke committed
88
89
      if (bin_mapper->ValueToBin(sample_values[j]) !=
          bin_mapper->GetMostFreqBin()) {
Guolin Ke's avatar
Guolin Ke committed
90
91
92
93
94
95
96
97
98
99
        ret.push_back(i);
      }
      ++i;
    } else {
      ret.push_back(i++);
    }
  }
  return ret;
}

Guolin Ke's avatar
Guolin Ke committed
100
101
102
103
104
105
std::vector<std::vector<int>> FindGroups(
    const std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
    const std::vector<int>& find_order, int** sample_indices,
    const int* num_per_col, int num_sample_col, data_size_t total_sample_cnt,
    data_size_t num_data, bool is_use_gpu, bool is_sparse,
    std::vector<int8_t>* multi_val_group) {
Guolin Ke's avatar
Guolin Ke committed
106
  const int max_search_group = 100;
107
  const int max_bin_per_group = 256;
Guolin Ke's avatar
Guolin Ke committed
108
109
  const data_size_t single_val_max_conflict_cnt =
      static_cast<data_size_t>(total_sample_cnt / 10000);
110
111
  multi_val_group->clear();

Guolin Ke's avatar
Guolin Ke committed
112
113
114
  Random rand(num_data);
  std::vector<std::vector<int>> features_in_group;
  std::vector<std::vector<bool>> conflict_marks;
115
116
  std::vector<data_size_t> group_used_row_cnt;
  std::vector<data_size_t> group_total_data_cnt;
Guolin Ke's avatar
Guolin Ke committed
117
118
  std::vector<int> group_num_bin;

119
  // first round: fill the single val group
Guolin Ke's avatar
Guolin Ke committed
120
  for (auto fidx : find_order) {
121
    bool is_filtered_feature = fidx >= num_sample_col;
Guolin Ke's avatar
Guolin Ke committed
122
123
    const data_size_t cur_non_zero_cnt =
        is_filtered_feature ? 0 : num_per_col[fidx];
Guolin Ke's avatar
Guolin Ke committed
124
125
    std::vector<int> available_groups;
    for (int gid = 0; gid < static_cast<int>(features_in_group.size()); ++gid) {
Guolin Ke's avatar
Guolin Ke committed
126
127
128
129
      auto cur_num_bin = group_num_bin[gid] + bin_mappers[fidx]->num_bin() +
                         (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0);
      if (group_total_data_cnt[gid] + cur_non_zero_cnt <=
          total_sample_cnt + single_val_max_conflict_cnt) {
130
        if (!is_use_gpu || cur_num_bin <= max_bin_per_group) {
Guolin Ke's avatar
Guolin Ke committed
131
132
          available_groups.push_back(gid);
        }
Guolin Ke's avatar
Guolin Ke committed
133
134
135
136
137
138
      }
    }
    std::vector<int> search_groups;
    if (!available_groups.empty()) {
      int last = static_cast<int>(available_groups.size()) - 1;
      auto indices = rand.Sample(last, std::min(last, max_search_group - 1));
139
      // always push the last group
Guolin Ke's avatar
Guolin Ke committed
140
141
142
143
144
      search_groups.push_back(available_groups.back());
      for (auto idx : indices) {
        search_groups.push_back(available_groups[idx]);
      }
    }
145
146
    int best_gid = -1;
    int best_conflict_cnt = -1;
Guolin Ke's avatar
Guolin Ke committed
147
    for (auto gid : search_groups) {
Guolin Ke's avatar
Guolin Ke committed
148
149
150
151
152
153
      const data_size_t rest_max_cnt = single_val_max_conflict_cnt -
                                       group_total_data_cnt[gid] +
                                       group_used_row_cnt[gid];
      const data_size_t cnt =
          is_filtered_feature
              ? 0
guanqun's avatar
guanqun committed
154
              : GetConflictCount(conflict_marks[gid], sample_indices[fidx],
Guolin Ke's avatar
Guolin Ke committed
155
                                 num_per_col[fidx], rest_max_cnt);
156
157
158
      if (cnt >= 0 && cnt <= rest_max_cnt && cnt <= cur_non_zero_cnt / 2) {
        best_gid = gid;
        best_conflict_cnt = cnt;
Guolin Ke's avatar
Guolin Ke committed
159
160
161
        break;
      }
    }
162
163
164
165
166
    if (best_gid >= 0) {
      features_in_group[best_gid].push_back(fidx);
      group_total_data_cnt[best_gid] += cur_non_zero_cnt;
      group_used_row_cnt[best_gid] += cur_non_zero_cnt - best_conflict_cnt;
      if (!is_filtered_feature) {
Guolin Ke's avatar
Guolin Ke committed
167
168
        MarkUsed(&conflict_marks[best_gid], sample_indices[fidx],
                 num_per_col[fidx]);
169
      }
Guolin Ke's avatar
Guolin Ke committed
170
171
172
      group_num_bin[best_gid] +=
          bin_mappers[fidx]->num_bin() +
          (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0);
173
    } else {
Guolin Ke's avatar
Guolin Ke committed
174
175
176
      features_in_group.emplace_back();
      features_in_group.back().push_back(fidx);
      conflict_marks.emplace_back(total_sample_cnt, false);
177
      if (!is_filtered_feature) {
Guolin Ke's avatar
Guolin Ke committed
178
179
        MarkUsed(&(conflict_marks.back()), sample_indices[fidx],
                 num_per_col[fidx]);
180
      }
181
182
      group_total_data_cnt.emplace_back(cur_non_zero_cnt);
      group_used_row_cnt.emplace_back(cur_non_zero_cnt);
Guolin Ke's avatar
Guolin Ke committed
183
184
185
      group_num_bin.push_back(
          1 + bin_mappers[fidx]->num_bin() +
          (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0));
186
187
    }
  }
Guolin Ke's avatar
Guolin Ke committed
188
189
190
191
  if (!is_sparse) {
    multi_val_group->resize(features_in_group.size(), false);
    return features_in_group;
  }
192
193
194
195
196
197
  std::vector<int> second_round_features;
  std::vector<std::vector<int>> features_in_group2;
  std::vector<std::vector<bool>> conflict_marks2;

  const double dense_threshold = 0.4;
  for (int gid = 0; gid < static_cast<int>(features_in_group.size()); ++gid) {
Guolin Ke's avatar
Guolin Ke committed
198
199
    const double dense_rate =
        static_cast<double>(group_used_row_cnt[gid]) / total_sample_cnt;
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
    if (dense_rate >= dense_threshold) {
      features_in_group2.push_back(std::move(features_in_group[gid]));
      conflict_marks2.push_back(std::move(conflict_marks[gid]));
    } else {
      for (auto fidx : features_in_group[gid]) {
        second_round_features.push_back(fidx);
      }
    }
  }

  features_in_group = features_in_group2;
  conflict_marks = conflict_marks2;
  multi_val_group->resize(features_in_group.size(), false);
  if (!second_round_features.empty()) {
    features_in_group.emplace_back();
    conflict_marks.emplace_back(total_sample_cnt, false);
    bool is_multi_val = is_use_gpu ? true : false;
    int conflict_cnt = 0;
    for (auto fidx : second_round_features) {
      features_in_group.back().push_back(fidx);
      if (!is_multi_val) {
        const int rest_max_cnt = single_val_max_conflict_cnt - conflict_cnt;
Guolin Ke's avatar
Guolin Ke committed
222
        const auto cnt =
guanqun's avatar
guanqun committed
223
            GetConflictCount(conflict_marks.back(), sample_indices[fidx],
Guolin Ke's avatar
Guolin Ke committed
224
                             num_per_col[fidx], rest_max_cnt);
225
226
227
228
229
        conflict_cnt += cnt;
        if (cnt < 0 || conflict_cnt > single_val_max_conflict_cnt) {
          is_multi_val = true;
          continue;
        }
Guolin Ke's avatar
Guolin Ke committed
230
231
        MarkUsed(&(conflict_marks.back()), sample_indices[fidx],
                 num_per_col[fidx]);
Guolin Ke's avatar
Guolin Ke committed
232
      }
Guolin Ke's avatar
Guolin Ke committed
233
    }
234
    multi_val_group->push_back(is_multi_val);
Guolin Ke's avatar
Guolin Ke committed
235
236
237
238
  }
  return features_in_group;
}

Guolin Ke's avatar
Guolin Ke committed
239
240
241
242
243
244
std::vector<std::vector<int>> FastFeatureBundling(
    const std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
    int** sample_indices, double** sample_values, const int* num_per_col,
    int num_sample_col, data_size_t total_sample_cnt,
    const std::vector<int>& used_features, data_size_t num_data,
    bool is_use_gpu, bool is_sparse, std::vector<int8_t>* multi_val_group) {
245
  Common::FunctionTimer fun_timer("Dataset::FastFeatureBundling", global_timer);
Guolin Ke's avatar
Guolin Ke committed
246
  std::vector<size_t> feature_non_zero_cnt;
247
  feature_non_zero_cnt.reserve(used_features.size());
Guolin Ke's avatar
Guolin Ke committed
248
249
  // put dense feature first
  for (auto fidx : used_features) {
250
251
252
253
254
    if (fidx < num_sample_col) {
      feature_non_zero_cnt.emplace_back(num_per_col[fidx]);
    } else {
      feature_non_zero_cnt.emplace_back(0);
    }
Guolin Ke's avatar
Guolin Ke committed
255
256
257
  }
  // sort by non zero cnt
  std::vector<int> sorted_idx;
258
  sorted_idx.reserve(used_features.size());
259
  for (int i = 0; i < static_cast<int>(used_features.size()); ++i) {
Guolin Ke's avatar
Guolin Ke committed
260
261
262
    sorted_idx.emplace_back(i);
  }
  // sort by non zero cnt, bigger first
263
264
  std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
                   [&feature_non_zero_cnt](int a, int b) {
Guolin Ke's avatar
Guolin Ke committed
265
266
                     return feature_non_zero_cnt[a] > feature_non_zero_cnt[b];
                   });
Guolin Ke's avatar
Guolin Ke committed
267
268

  std::vector<int> feature_order_by_cnt;
269
  feature_order_by_cnt.reserve(sorted_idx.size());
Guolin Ke's avatar
Guolin Ke committed
270
271
272
  for (auto sidx : sorted_idx) {
    feature_order_by_cnt.push_back(used_features[sidx]);
  }
273

Guolin Ke's avatar
Guolin Ke committed
274
275
276
277
278
279
  std::vector<std::vector<int>> tmp_indices;
  std::vector<int> tmp_num_per_col(num_sample_col, 0);
  for (auto fidx : used_features) {
    if (fidx >= num_sample_col) {
      continue;
    }
Guolin Ke's avatar
Guolin Ke committed
280
281
282
    auto ret = FixSampleIndices(
        bin_mappers[fidx].get(), static_cast<int>(total_sample_cnt),
        num_per_col[fidx], sample_indices[fidx], sample_values[fidx]);
Guolin Ke's avatar
Guolin Ke committed
283
284
285
286
287
288
289
290
    if (!ret.empty()) {
      tmp_indices.push_back(ret);
      tmp_num_per_col[fidx] = static_cast<int>(ret.size());
      sample_indices[fidx] = tmp_indices.back().data();
    } else {
      tmp_num_per_col[fidx] = num_per_col[fidx];
    }
  }
291
  std::vector<int8_t> group_is_multi_val, group_is_multi_val2;
Guolin Ke's avatar
Guolin Ke committed
292
293
294
295
296
297
298
299
  auto features_in_group =
      FindGroups(bin_mappers, used_features, sample_indices,
                 tmp_num_per_col.data(), num_sample_col, total_sample_cnt,
                 num_data, is_use_gpu, is_sparse, &group_is_multi_val);
  auto group2 =
      FindGroups(bin_mappers, feature_order_by_cnt, sample_indices,
                 tmp_num_per_col.data(), num_sample_col, total_sample_cnt,
                 num_data, is_use_gpu, is_sparse, &group_is_multi_val2);
300

Guolin Ke's avatar
Guolin Ke committed
301
302
  if (features_in_group.size() > group2.size()) {
    features_in_group = group2;
303
    group_is_multi_val = group_is_multi_val2;
Guolin Ke's avatar
Guolin Ke committed
304
305
  }
  // shuffle groups
306
307
  int num_group = static_cast<int>(features_in_group.size());
  Random tmp_rand(num_data);
Guolin Ke's avatar
Guolin Ke committed
308
309
  for (int i = 0; i < num_group - 1; ++i) {
    int j = tmp_rand.NextShort(i + 1, num_group);
310
    std::swap(features_in_group[i], features_in_group[j]);
311
    // Using std::swap for vector<bool> will cause the wrong result.
312
    std::swap(group_is_multi_val[i], group_is_multi_val[j]);
Guolin Ke's avatar
Guolin Ke committed
313
  }
314
315
  *multi_val_group = group_is_multi_val;
  return features_in_group;
Guolin Ke's avatar
Guolin Ke committed
316
317
}

Guolin Ke's avatar
Guolin Ke committed
318
319
320
321
322
323
void Dataset::Construct(std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
                        int num_total_features,
                        const std::vector<std::vector<double>>& forced_bins,
                        int** sample_non_zero_indices, double** sample_values,
                        const int* num_per_col, int num_sample_col,
                        size_t total_sample_cnt, const Config& io_config) {
324
  num_total_features_ = num_total_features;
Nikita Titov's avatar
Nikita Titov committed
325
  CHECK_EQ(num_total_features_, static_cast<int>(bin_mappers->size()));
Guolin Ke's avatar
Guolin Ke committed
326
327
  // get num_features
  std::vector<int> used_features;
Guolin Ke's avatar
Guolin Ke committed
328
  auto& ref_bin_mappers = *bin_mappers;
Guolin Ke's avatar
Guolin Ke committed
329
  for (int i = 0; i < static_cast<int>(bin_mappers->size()); ++i) {
Guolin Ke's avatar
Guolin Ke committed
330
    if (ref_bin_mappers[i] != nullptr && !ref_bin_mappers[i]->is_trivial()) {
Guolin Ke's avatar
Guolin Ke committed
331
      used_features.emplace_back(i);
Guolin Ke's avatar
Guolin Ke committed
332
    }
Guolin Ke's avatar
Guolin Ke committed
333
  }
Guolin Ke's avatar
Guolin Ke committed
334
  if (used_features.empty()) {
Guolin Ke's avatar
Guolin Ke committed
335
336
337
    Log::Warning(
        "There are no meaningful features, as all feature values are "
        "constant.");
Guolin Ke's avatar
Guolin Ke committed
338
  }
Guolin Ke's avatar
Guolin Ke committed
339
  auto features_in_group = NoGroup(used_features);
340
341
342
343
344
345
346
347
348
349

  auto is_sparse = io_config.is_enable_sparse;
  if (io_config.device_type == std::string("cuda")) {
      LGBM_config_::current_device = lgbm_device_cuda;
      if (is_sparse) {
        Log::Warning("Using sparse features with CUDA is currently not supported.");
      }
      is_sparse = false;
  }

350
  std::vector<int8_t> group_is_multi_val(used_features.size(), 0);
351
  if (io_config.enable_bundle && !used_features.empty()) {
352
    bool lgbm_is_gpu_used = io_config.device_type == std::string("gpu") || io_config.device_type == std::string("cuda");
Guolin Ke's avatar
Guolin Ke committed
353
354
355
    features_in_group = FastFeatureBundling(
        *bin_mappers, sample_non_zero_indices, sample_values, num_per_col,
        num_sample_col, static_cast<data_size_t>(total_sample_cnt),
356
357
        used_features, num_data_, lgbm_is_gpu_used,
        is_sparse, &group_is_multi_val);
Guolin Ke's avatar
Guolin Ke committed
358
359
  }

Guolin Ke's avatar
Guolin Ke committed
360
361
362
363
364
365
366
367
368
369
  num_features_ = 0;
  for (const auto& fs : features_in_group) {
    num_features_ += static_cast<int>(fs.size());
  }
  int cur_fidx = 0;
  used_feature_map_ = std::vector<int>(num_total_features_, -1);
  num_groups_ = static_cast<int>(features_in_group.size());
  real_feature_idx_.resize(num_features_);
  feature2group_.resize(num_features_);
  feature2subfeature_.resize(num_features_);
Guolin Ke's avatar
Guolin Ke committed
370
  feature_need_push_zeros_.clear();
Guolin Ke's avatar
Guolin Ke committed
371
372
373
374
375
  group_bin_boundaries_.clear();
  uint64_t num_total_bin = 0;
  group_bin_boundaries_.push_back(num_total_bin);
  group_feature_start_.resize(num_groups_);
  group_feature_cnt_.resize(num_groups_);
Guolin Ke's avatar
Guolin Ke committed
376
377
378
  for (int i = 0; i < num_groups_; ++i) {
    auto cur_features = features_in_group[i];
    int cur_cnt_features = static_cast<int>(cur_features.size());
Guolin Ke's avatar
Guolin Ke committed
379
380
    group_feature_start_[i] = cur_fidx;
    group_feature_cnt_[i] = cur_cnt_features;
Guolin Ke's avatar
Guolin Ke committed
381
382
383
384
385
386
387
388
    // get bin_mappers
    std::vector<std::unique_ptr<BinMapper>> cur_bin_mappers;
    for (int j = 0; j < cur_cnt_features; ++j) {
      int real_fidx = cur_features[j];
      used_feature_map_[real_fidx] = cur_fidx;
      real_feature_idx_[cur_fidx] = real_fidx;
      feature2group_[cur_fidx] = i;
      feature2subfeature_[cur_fidx] = j;
Guolin Ke's avatar
Guolin Ke committed
389
      cur_bin_mappers.emplace_back(ref_bin_mappers[real_fidx].release());
Guolin Ke's avatar
Guolin Ke committed
390
391
      if (cur_bin_mappers.back()->GetDefaultBin() !=
          cur_bin_mappers.back()->GetMostFreqBin()) {
Guolin Ke's avatar
Guolin Ke committed
392
393
        feature_need_push_zeros_.push_back(cur_fidx);
      }
Guolin Ke's avatar
Guolin Ke committed
394
395
      ++cur_fidx;
    }
Guolin Ke's avatar
Guolin Ke committed
396
    feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>(
397
      new FeatureGroup(cur_cnt_features, group_is_multi_val[i], &cur_bin_mappers, num_data_, i)));
Guolin Ke's avatar
Guolin Ke committed
398
399
400
    num_total_bin += feature_groups_[i]->num_total_bin_;
    group_bin_boundaries_.push_back(num_total_bin);
  }
Belinda Trotta's avatar
Belinda Trotta committed
401
  if (!io_config.max_bin_by_feature.empty()) {
402
403
404
405
    CHECK_EQ(static_cast<size_t>(num_total_features_),
             io_config.max_bin_by_feature.size());
    CHECK_GT(*(std::min_element(io_config.max_bin_by_feature.begin(),
                                io_config.max_bin_by_feature.end())), 1);
Belinda Trotta's avatar
Belinda Trotta committed
406
    max_bin_by_feature_.resize(num_total_features_);
Guolin Ke's avatar
Guolin Ke committed
407
408
    max_bin_by_feature_.assign(io_config.max_bin_by_feature.begin(),
                               io_config.max_bin_by_feature.end());
Belinda Trotta's avatar
Belinda Trotta committed
409
  }
410
  forced_bin_bounds_ = forced_bins;
411
412
413
414
415
  max_bin_ = io_config.max_bin;
  min_data_in_bin_ = io_config.min_data_in_bin;
  bin_construct_sample_cnt_ = io_config.bin_construct_sample_cnt;
  use_missing_ = io_config.use_missing;
  zero_as_missing_ = io_config.zero_as_missing;
416
417
418
419
420
421
422
423
424
425
426
427
  has_raw_ = false;
  if (io_config.linear_tree) {
    has_raw_ = true;
  }
  numeric_feature_map_ = std::vector<int>(num_features_, -1);
  num_numeric_features_ = 0;
  for (int i = 0; i < num_features_; ++i) {
    if (FeatureBinMapper(i)->bin_type() == BinType::NumericalBin) {
      numeric_feature_map_[i] = num_numeric_features_;
      ++num_numeric_features_;
    }
  }
428
429
}

Guolin Ke's avatar
Guolin Ke committed
430
void Dataset::FinishLoad() {
Guolin Ke's avatar
Guolin Ke committed
431
432
433
  if (is_finish_load_) {
    return;
  }
434
435
  if (num_groups_ > 0) {
    for (int i = 0; i < num_groups_; ++i) {
436
      feature_groups_[i]->FinishLoad();
437
    }
Guolin Ke's avatar
Guolin Ke committed
438
  }
Guolin Ke's avatar
Guolin Ke committed
439
  is_finish_load_ = true;
Guolin Ke's avatar
Guolin Ke committed
440
}
Guolin Ke's avatar
Guolin Ke committed
441

442
void PushDataToMultiValBin(
Guolin Ke's avatar
Guolin Ke committed
443
    data_size_t num_data, const std::vector<uint32_t> most_freq_bins,
444
    const std::vector<uint32_t> offsets,
Guolin Ke's avatar
Guolin Ke committed
445
    std::vector<std::vector<std::unique_ptr<BinIterator>>>* iters,
446
447
448
    MultiValBin* ret) {
  Common::FunctionTimer fun_time("Dataset::PushDataToMultiValBin",
                                 global_timer);
449
  if (ret->IsSparse()) {
Guolin Ke's avatar
Guolin Ke committed
450
451
452
453
454
    Threading::For<data_size_t>(
        0, num_data, 1024, [&](int tid, data_size_t start, data_size_t end) {
          std::vector<uint32_t> cur_data;
          cur_data.reserve(most_freq_bins.size());
          for (size_t j = 0; j < most_freq_bins.size(); ++j) {
Guolin Ke's avatar
Guolin Ke committed
455
            (*iters)[tid][j]->Reset(start);
456
          }
Guolin Ke's avatar
Guolin Ke committed
457
458
459
          for (data_size_t i = start; i < end; ++i) {
            cur_data.clear();
            for (size_t j = 0; j < most_freq_bins.size(); ++j) {
460
              // for sparse multi value bin, we store the feature bin values with offset added
Guolin Ke's avatar
Guolin Ke committed
461
              auto cur_bin = (*iters)[tid][j]->Get(i);
Guolin Ke's avatar
Guolin Ke committed
462
463
464
465
466
467
468
469
470
471
              if (cur_bin == most_freq_bins[j]) {
                continue;
              }
              cur_bin += offsets[j];
              if (most_freq_bins[j] == 0) {
                cur_bin -= 1;
              }
              cur_data.push_back(cur_bin);
            }
            ret->PushOneRow(tid, i, cur_data);
472
          }
Guolin Ke's avatar
Guolin Ke committed
473
        });
474
  } else {
Guolin Ke's avatar
Guolin Ke committed
475
476
477
478
    Threading::For<data_size_t>(
        0, num_data, 1024, [&](int tid, data_size_t start, data_size_t end) {
          std::vector<uint32_t> cur_data(most_freq_bins.size(), 0);
          for (size_t j = 0; j < most_freq_bins.size(); ++j) {
Guolin Ke's avatar
Guolin Ke committed
479
            (*iters)[tid][j]->Reset(start);
Guolin Ke's avatar
Guolin Ke committed
480
481
482
          }
          for (data_size_t i = start; i < end; ++i) {
            for (size_t j = 0; j < most_freq_bins.size(); ++j) {
483
              // for dense multi value bin, the feature bin values without offsets are used
Guolin Ke's avatar
Guolin Ke committed
484
              auto cur_bin = (*iters)[tid][j]->Get(i);
Guolin Ke's avatar
Guolin Ke committed
485
              cur_data[j] = cur_bin;
486
            }
Guolin Ke's avatar
Guolin Ke committed
487
            ret->PushOneRow(tid, i, cur_data);
488
          }
Guolin Ke's avatar
Guolin Ke committed
489
        });
490
491
492
  }
}

493
MultiValBin* Dataset::GetMultiBinFromSparseFeatures(const std::vector<uint32_t>& offsets) const {
494
495
  Common::FunctionTimer fun_time("Dataset::GetMultiBinFromSparseFeatures",
                                 global_timer);
496
497
498
499
500
501
502
503
504
505
506
507
508
509
  int multi_group_id = -1;
  for (int i = 0; i < num_groups_; ++i) {
    if (feature_groups_[i]->is_multi_val_) {
      if (multi_group_id < 0) {
        multi_group_id = i;
      } else {
        Log::Fatal("Bug. There should be only one multi-val group.");
      }
    }
  }
  if (multi_group_id < 0) {
    return nullptr;
  }
  const int num_feature = feature_groups_[multi_group_id]->num_feature_;
510
  int num_threads = OMP_NUM_THREADS();
511
512
513
514
515

  std::vector<std::vector<std::unique_ptr<BinIterator>>> iters(num_threads);
  std::vector<uint32_t> most_freq_bins;
  double sum_sparse_rate = 0;
  for (int i = 0; i < num_feature; ++i) {
516
#pragma omp parallel for schedule(static, 1)
517
    for (int tid = 0; tid < num_threads; ++tid) {
518
519
      iters[tid].emplace_back(
          feature_groups_[multi_group_id]->SubFeatureIterator(i));
520
    }
521
522
523
524
    most_freq_bins.push_back(
        feature_groups_[multi_group_id]->bin_mappers_[i]->GetMostFreqBin());
    sum_sparse_rate +=
        feature_groups_[multi_group_id]->bin_mappers_[i]->sparse_rate();
525
526
  }
  sum_sparse_rate /= num_feature;
527
528
  Log::Debug("Dataset::GetMultiBinFromSparseFeatures: sparse rate %f",
             sum_sparse_rate);
529
  std::unique_ptr<MultiValBin> ret;
530
  ret.reset(MultiValBin::CreateMultiValBin(num_data_, offsets.back(),
531
                                           num_feature, sum_sparse_rate, offsets));
Guolin Ke's avatar
Guolin Ke committed
532
  PushDataToMultiValBin(num_data_, most_freq_bins, offsets, &iters, ret.get());
533
534
535
536
  ret->FinishLoad();
  return ret.release();
}

537
MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector<uint32_t>& offsets) const {
538
539
  Common::FunctionTimer fun_time("Dataset::GetMultiBinFromAllFeatures",
                                 global_timer);
540
  int num_threads = OMP_NUM_THREADS();
541
542
543
544
545
  double sum_dense_ratio = 0;

  std::unique_ptr<MultiValBin> ret;
  std::vector<std::vector<std::unique_ptr<BinIterator>>> iters(num_threads);
  std::vector<uint32_t> most_freq_bins;
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
  int ncol = 0;
  for (int gid = 0; gid < num_groups_; ++gid) {
    if (feature_groups_[gid]->is_multi_val_) {
      ncol += feature_groups_[gid]->num_feature_;
    } else {
      ++ncol;
    }
    for (int fid = 0; fid < feature_groups_[gid]->num_feature_; ++fid) {
      const auto& bin_mapper = feature_groups_[gid]->bin_mappers_[fid];
      sum_dense_ratio += 1.0f - bin_mapper->sparse_rate();
    }
  }
  sum_dense_ratio /= ncol;
  const int offset = (1.0f - sum_dense_ratio) >=
    MultiValBin::multi_val_bin_sparse_threshold ? 1 : 0;
  int num_total_bin = offset;
562
563
564
565
566
567
568
  for (int gid = 0; gid < num_groups_; ++gid) {
    if (feature_groups_[gid]->is_multi_val_) {
      for (int fid = 0; fid < feature_groups_[gid]->num_feature_; ++fid) {
        const auto& bin_mapper = feature_groups_[gid]->bin_mappers_[fid];
        most_freq_bins.push_back(bin_mapper->GetMostFreqBin());
        num_total_bin += bin_mapper->num_bin();
        if (most_freq_bins.back() == 0) {
569
          num_total_bin -= offset;
570
        }
571
#pragma omp parallel for schedule(static, 1)
572
        for (int tid = 0; tid < num_threads; ++tid) {
573
574
          iters[tid].emplace_back(
              feature_groups_[gid]->SubFeatureIterator(fid));
575
576
577
578
        }
      }
    } else {
      most_freq_bins.push_back(0);
579
      num_total_bin += feature_groups_[gid]->bin_offsets_.back() - offset;
580
581
582
583
584
      for (int tid = 0; tid < num_threads; ++tid) {
        iters[tid].emplace_back(feature_groups_[gid]->FeatureGroupIterator());
      }
    }
  }
585
  CHECK(static_cast<int>(most_freq_bins.size()) == ncol);
586
587
588
589
  Log::Debug("Dataset::GetMultiBinFromAllFeatures: sparse rate %f",
             1.0 - sum_dense_ratio);
  ret.reset(MultiValBin::CreateMultiValBin(
      num_data_, num_total_bin, static_cast<int>(most_freq_bins.size()),
590
      1.0 - sum_dense_ratio, offsets));
Guolin Ke's avatar
Guolin Ke committed
591
  PushDataToMultiValBin(num_data_, most_freq_bins, offsets, &iters, ret.get());
592
593
594
595
  ret->FinishLoad();
  return ret.release();
}

596
TrainingShareStates* Dataset::GetShareStates(
597
598
    score_t* gradients, score_t* hessians,
    const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
599
    bool force_col_wise, bool force_row_wise) const {
600
601
  Common::FunctionTimer fun_timer("Dataset::TestMultiThreadingMethod",
                                  global_timer);
602
  if (force_col_wise && force_row_wise) {
603
    Log::Fatal(
604
        "Cannot set both of `force_col_wise` and `force_row_wise` to `true` at "
605
        "the same time");
606
607
  }
  if (num_groups_ <= 0) {
608
    TrainingShareStates* share_state = new TrainingShareStates();
609
    share_state->is_col_wise = true;
610
611
    share_state->is_constant_hessian = is_constant_hessian;
    return share_state;
612
  }
613
  if (force_col_wise) {
614
    TrainingShareStates* share_state = new TrainingShareStates();
615
616
617
618
619
620
    std::vector<uint32_t> offsets;
    share_state->CalcBinOffsets(
      feature_groups_, &offsets, true);
    share_state->SetMultiValBin(GetMultiBinFromSparseFeatures(offsets),
      num_data_, feature_groups_, false, true);
    share_state->is_col_wise = true;
621
622
    share_state->is_constant_hessian = is_constant_hessian;
    return share_state;
623
  } else if (force_row_wise) {
624
    TrainingShareStates* share_state = new TrainingShareStates();
625
626
627
628
629
630
    std::vector<uint32_t> offsets;
    share_state->CalcBinOffsets(
      feature_groups_, &offsets, false);
    share_state->SetMultiValBin(GetMultiBinFromAllFeatures(offsets), num_data_,
      feature_groups_, false, false);
    share_state->is_col_wise = false;
631
632
    share_state->is_constant_hessian = is_constant_hessian;
    return share_state;
633
634
635
  } else {
    std::unique_ptr<MultiValBin> sparse_bin;
    std::unique_ptr<MultiValBin> all_bin;
636
637
638
639
    std::unique_ptr<TrainingShareStates> col_wise_state;
    std::unique_ptr<TrainingShareStates> row_wise_state;
    col_wise_state.reset(new TrainingShareStates());
    row_wise_state.reset(new TrainingShareStates());
640

641
    std::chrono::duration<double, std::milli> col_wise_init_time, row_wise_init_time;
642
    auto start_time = std::chrono::steady_clock::now();
643
644
645
646
    std::vector<uint32_t> col_wise_offsets;
    col_wise_state->CalcBinOffsets(feature_groups_, &col_wise_offsets, true);
    col_wise_state->SetMultiValBin(GetMultiBinFromSparseFeatures(col_wise_offsets), num_data_,
      feature_groups_, false, true);
647
    col_wise_init_time = std::chrono::steady_clock::now() - start_time;
648

649
    start_time = std::chrono::steady_clock::now();
650
651
652
653
654
655
656
657
    std::vector<uint32_t> row_wise_offsets;
    row_wise_state->CalcBinOffsets(feature_groups_, &row_wise_offsets, false);
    row_wise_state->SetMultiValBin(GetMultiBinFromAllFeatures(row_wise_offsets), num_data_,
      feature_groups_, false, false);
    row_wise_init_time = std::chrono::steady_clock::now() - start_time;

    uint64_t max_total_bin = std::max<uint64_t>(row_wise_state->num_hist_total_bin(),
      col_wise_state->num_hist_total_bin());
658
    std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>>
659
        hist_data(max_total_bin * 2);
660
661

    Log::Debug(
662
663
664
665
666
667
668
669
670
      "init for col-wise cost %f seconds, init for row-wise cost %f seconds",
      col_wise_init_time * 1e-3, row_wise_init_time * 1e-3);

    col_wise_state->is_col_wise = true;
    col_wise_state->is_constant_hessian = is_constant_hessian;
    InitTrain(is_feature_used, col_wise_state.get());
    row_wise_state->is_col_wise = false;
    row_wise_state->is_constant_hessian = is_constant_hessian;
    InitTrain(is_feature_used, row_wise_state.get());
671
    std::chrono::duration<double, std::milli> col_wise_time, row_wise_time;
672
    start_time = std::chrono::steady_clock::now();
673
    ConstructHistograms(is_feature_used, nullptr, num_data_, gradients,
674
                        hessians, gradients, hessians, col_wise_state.get(),
675
                        hist_data.data());
676
677
    col_wise_time = std::chrono::steady_clock::now() - start_time;
    start_time = std::chrono::steady_clock::now();
Guolin Ke's avatar
Guolin Ke committed
678
    ConstructHistograms(is_feature_used, nullptr, num_data_, gradients,
679
                        hessians, gradients, hessians, row_wise_state.get(),
Guolin Ke's avatar
Guolin Ke committed
680
                        hist_data.data());
681
    row_wise_time = std::chrono::steady_clock::now() - start_time;
682

683
    if (col_wise_time < row_wise_time) {
684
685
      auto overhead_cost = row_wise_init_time + row_wise_time + col_wise_time;
      Log::Warning(
686
          "Auto-choosing col-wise multi-threading, the overhead of testing was "
Nikita Titov's avatar
Nikita Titov committed
687
688
          "%f seconds.\n"
          "You can set `force_col_wise=true` to remove the overhead.",
689
          overhead_cost * 1e-3);
690
      return col_wise_state.release();
691
    } else {
692
693
      auto overhead_cost = col_wise_init_time + row_wise_time + col_wise_time;
      Log::Warning(
694
          "Auto-choosing row-wise multi-threading, the overhead of testing was "
Nikita Titov's avatar
Nikita Titov committed
695
696
697
          "%f seconds.\n"
          "You can set `force_row_wise=true` to remove the overhead.\n"
          "And if memory is not enough, you can set `force_col_wise=true`.",
698
          overhead_cost * 1e-3);
699
      if (row_wise_state->IsSparseRowwise()) {
700
        Log::Debug("Using Sparse Multi-Val Bin");
701
      } else {
702
        Log::Debug("Using Dense Multi-Val Bin");
703
      }
704
      return row_wise_state.release();
705
706
707
708
    }
  }
}

709
void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) {
Guolin Ke's avatar
Guolin Ke committed
710
  feature_groups_.clear();
Guolin Ke's avatar
Guolin Ke committed
711
  num_features_ = dataset->num_features_;
Guolin Ke's avatar
Guolin Ke committed
712
  num_groups_ = dataset->num_groups_;
713
  has_raw_ = dataset->has_raw();
Guolin Ke's avatar
Guolin Ke committed
714
  // copy feature bin mapper data
Guolin Ke's avatar
Guolin Ke committed
715
  for (int i = 0; i < num_groups_; ++i) {
Guolin Ke's avatar
Guolin Ke committed
716
717
    feature_groups_.emplace_back(
        new FeatureGroup(*dataset->feature_groups_[i], num_data_));
Guolin Ke's avatar
Guolin Ke committed
718
  }
Guolin Ke's avatar
Guolin Ke committed
719
  feature_groups_.shrink_to_fit();
Guolin Ke's avatar
Guolin Ke committed
720
721
722
  used_feature_map_ = dataset->used_feature_map_;
  num_total_features_ = dataset->num_total_features_;
  feature_names_ = dataset->feature_names_;
Guolin Ke's avatar
Guolin Ke committed
723
  label_idx_ = dataset->label_idx_;
Guolin Ke's avatar
Guolin Ke committed
724
725
726
727
728
729
  real_feature_idx_ = dataset->real_feature_idx_;
  feature2group_ = dataset->feature2group_;
  feature2subfeature_ = dataset->feature2subfeature_;
  group_bin_boundaries_ = dataset->group_bin_boundaries_;
  group_feature_start_ = dataset->group_feature_start_;
  group_feature_cnt_ = dataset->group_feature_cnt_;
730
  forced_bin_bounds_ = dataset->forced_bin_bounds_;
Guolin Ke's avatar
Guolin Ke committed
731
  feature_need_push_zeros_ = dataset->feature_need_push_zeros_;
Guolin Ke's avatar
Guolin Ke committed
732
733
734
735
736
737
738
739
}

void Dataset::CreateValid(const Dataset* dataset) {
  feature_groups_.clear();
  num_features_ = dataset->num_features_;
  num_groups_ = num_features_;
  feature2group_.clear();
  feature2subfeature_.clear();
740
741
742
743
  has_raw_ = dataset->has_raw();
  numeric_feature_map_ = dataset->numeric_feature_map_;
  num_numeric_features_ = dataset->num_numeric_features_;
  // copy feature bin mapper data
Guolin Ke's avatar
Guolin Ke committed
744
  feature_need_push_zeros_.clear();
Guolin Ke's avatar
Guolin Ke committed
745
746
747
748
749
  group_bin_boundaries_.clear();
  uint64_t num_total_bin = 0;
  group_bin_boundaries_.push_back(num_total_bin);
  group_feature_start_.resize(num_groups_);
  group_feature_cnt_.resize(num_groups_);
Guolin Ke's avatar
Guolin Ke committed
750
751
752
  for (int i = 0; i < num_features_; ++i) {
    std::vector<std::unique_ptr<BinMapper>> bin_mappers;
    bin_mappers.emplace_back(new BinMapper(*(dataset->FeatureBinMapper(i))));
Guolin Ke's avatar
Guolin Ke committed
753
754
    if (bin_mappers.back()->GetDefaultBin() !=
        bin_mappers.back()->GetMostFreqBin()) {
Guolin Ke's avatar
Guolin Ke committed
755
756
      feature_need_push_zeros_.push_back(i);
    }
757
    feature_groups_.emplace_back(new FeatureGroup(&bin_mappers, num_data_));
Guolin Ke's avatar
Guolin Ke committed
758
759
    feature2group_.push_back(i);
    feature2subfeature_.push_back(0);
Guolin Ke's avatar
Guolin Ke committed
760
761
762
763
    num_total_bin += feature_groups_[i]->num_total_bin_;
    group_bin_boundaries_.push_back(num_total_bin);
    group_feature_start_[i] = i;
    group_feature_cnt_[i] = 1;
Guolin Ke's avatar
Guolin Ke committed
764
765
766
767
768
769
770
771
  }

  feature_groups_.shrink_to_fit();
  used_feature_map_ = dataset->used_feature_map_;
  num_total_features_ = dataset->num_total_features_;
  feature_names_ = dataset->feature_names_;
  label_idx_ = dataset->label_idx_;
  real_feature_idx_ = dataset->real_feature_idx_;
772
  forced_bin_bounds_ = dataset->forced_bin_bounds_;
Guolin Ke's avatar
Guolin Ke committed
773
774
}

Guolin Ke's avatar
Guolin Ke committed
775
776
777
void Dataset::ReSize(data_size_t num_data) {
  if (num_data_ != num_data) {
    num_data_ = num_data;
778
    OMP_INIT_EX();
Guolin Ke's avatar
Guolin Ke committed
779
#pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
780
    for (int group = 0; group < num_groups_; ++group) {
781
      OMP_LOOP_EX_BEGIN();
Guolin Ke's avatar
Guolin Ke committed
782
      feature_groups_[group]->ReSize(num_data_);
783
      OMP_LOOP_EX_END();
Guolin Ke's avatar
Guolin Ke committed
784
    }
785
    OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
786
787
788
  }
}

789
void Dataset::CopySubrow(const Dataset* fullset,
Guolin Ke's avatar
Guolin Ke committed
790
791
                         const data_size_t* used_indices,
                         data_size_t num_used_indices, bool need_meta_data) {
Nikita Titov's avatar
Nikita Titov committed
792
  CHECK_EQ(num_used_indices, num_data_);
793
  OMP_INIT_EX();
Guolin Ke's avatar
Guolin Ke committed
794
#pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
795
  for (int group = 0; group < num_groups_; ++group) {
796
    OMP_LOOP_EX_BEGIN();
797
    feature_groups_[group]->CopySubrow(fullset->feature_groups_[group].get(),
Guolin Ke's avatar
Guolin Ke committed
798
                                       used_indices, num_used_indices);
799
    OMP_LOOP_EX_END();
Guolin Ke's avatar
Guolin Ke committed
800
  }
801
  OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
802
  if (need_meta_data) {
Guolin Ke's avatar
Guolin Ke committed
803
    metadata_.Init(fullset->metadata_, used_indices, num_used_indices);
Guolin Ke's avatar
Guolin Ke committed
804
  }
Guolin Ke's avatar
Guolin Ke committed
805
  is_finish_load_ = true;
806
807
808
809
810
811
812
813
814
815
816
  numeric_feature_map_ = fullset->numeric_feature_map_;
  num_numeric_features_ = fullset->num_numeric_features_;
  if (has_raw_) {
    ResizeRaw(num_used_indices);
#pragma omp parallel for schedule(static)
    for (int i = 0; i < num_used_indices; ++i) {
      for (int j = 0; j < num_numeric_features_; ++j) {
        raw_data_[j][i] = fullset->raw_data_[j][used_indices[i]];
      }
    }
  }
Guolin Ke's avatar
Guolin Ke committed
817
818
}

Guolin Ke's avatar
Guolin Ke committed
819
820
bool Dataset::SetFloatField(const char* field_name, const float* field_data,
                            data_size_t num_element) {
Guolin Ke's avatar
Guolin Ke committed
821
822
823
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("label") || name == std::string("target")) {
Guolin Ke's avatar
Guolin Ke committed
824
#ifdef LABEL_T_USE_DOUBLE
825
    Log::Fatal("Don't support LABEL_T_USE_DOUBLE");
Guolin Ke's avatar
Guolin Ke committed
826
#else
827
    metadata_.SetLabel(field_data, num_element);
Guolin Ke's avatar
Guolin Ke committed
828
#endif
Guolin Ke's avatar
Guolin Ke committed
829
  } else if (name == std::string("weight") || name == std::string("weights")) {
Guolin Ke's avatar
Guolin Ke committed
830
#ifdef LABEL_T_USE_DOUBLE
831
    Log::Fatal("Don't support LABEL_T_USE_DOUBLE");
Guolin Ke's avatar
Guolin Ke committed
832
#else
833
    metadata_.SetWeights(field_data, num_element);
Guolin Ke's avatar
Guolin Ke committed
834
#endif
Guolin Ke's avatar
Guolin Ke committed
835
836
837
838
839
840
  } else {
    return false;
  }
  return true;
}

Guolin Ke's avatar
Guolin Ke committed
841
842
bool Dataset::SetDoubleField(const char* field_name, const double* field_data,
                             data_size_t num_element) {
Guolin Ke's avatar
Guolin Ke committed
843
844
845
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("init_score")) {
846
    metadata_.SetInitScore(field_data, num_element);
Guolin Ke's avatar
Guolin Ke committed
847
  } else {
848
    return false;
Guolin Ke's avatar
Guolin Ke committed
849
  }
850
  return true;
Guolin Ke's avatar
Guolin Ke committed
851
852
}

Guolin Ke's avatar
Guolin Ke committed
853
854
bool Dataset::SetIntField(const char* field_name, const int* field_data,
                          data_size_t num_element) {
855
856
857
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("query") || name == std::string("group")) {
Guolin Ke's avatar
Guolin Ke committed
858
    metadata_.SetQuery(field_data, num_element);
859
860
861
862
863
864
  } else {
    return false;
  }
  return true;
}

Guolin Ke's avatar
Guolin Ke committed
865
866
bool Dataset::GetFloatField(const char* field_name, data_size_t* out_len,
                            const float** out_ptr) {
867
868
869
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("label") || name == std::string("target")) {
Guolin Ke's avatar
Guolin Ke committed
870
#ifdef LABEL_T_USE_DOUBLE
871
    Log::Fatal("Don't support LABEL_T_USE_DOUBLE");
Guolin Ke's avatar
Guolin Ke committed
872
#else
873
874
    *out_ptr = metadata_.label();
    *out_len = num_data_;
Guolin Ke's avatar
Guolin Ke committed
875
#endif
876
  } else if (name == std::string("weight") || name == std::string("weights")) {
Guolin Ke's avatar
Guolin Ke committed
877
#ifdef LABEL_T_USE_DOUBLE
878
    Log::Fatal("Don't support LABEL_T_USE_DOUBLE");
Guolin Ke's avatar
Guolin Ke committed
879
#else
880
881
    *out_ptr = metadata_.weights();
    *out_len = num_data_;
Guolin Ke's avatar
Guolin Ke committed
882
#endif
Guolin Ke's avatar
Guolin Ke committed
883
884
885
886
887
888
  } else {
    return false;
  }
  return true;
}

Guolin Ke's avatar
Guolin Ke committed
889
890
bool Dataset::GetDoubleField(const char* field_name, data_size_t* out_len,
                             const double** out_ptr) {
Guolin Ke's avatar
Guolin Ke committed
891
892
893
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("init_score")) {
894
    *out_ptr = metadata_.init_score();
Guolin Ke's avatar
Guolin Ke committed
895
    *out_len = static_cast<data_size_t>(metadata_.num_init_score());
896
  } else {
897
898
    return false;
  }
899
  return true;
900
901
}

Guolin Ke's avatar
Guolin Ke committed
902
903
bool Dataset::GetIntField(const char* field_name, data_size_t* out_len,
                          const int** out_ptr) {
904
905
906
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("query") || name == std::string("group")) {
907
    *out_ptr = metadata_.query_boundaries();
Guolin Ke's avatar
Guolin Ke committed
908
    *out_len = metadata_.num_queries() + 1;
Guolin Ke's avatar
Guolin Ke committed
909
910
911
  } else {
    return false;
  }
912
  return true;
913
914
}

Guolin Ke's avatar
Guolin Ke committed
915
void Dataset::SaveBinaryFile(const char* bin_filename) {
Guolin Ke's avatar
Guolin Ke committed
916
  if (bin_filename != nullptr && std::string(bin_filename) == data_filename_) {
917
    Log::Warning("Bianry file %s already exists", bin_filename);
Guolin Ke's avatar
Guolin Ke committed
918
919
    return;
  }
Guolin Ke's avatar
Guolin Ke committed
920
  // if not pass a filename, just append ".bin" of original file
Guolin Ke's avatar
Guolin Ke committed
921
  std::string bin_filename_str(data_filename_);
Guolin Ke's avatar
Guolin Ke committed
922
923
924
925
  if (bin_filename == nullptr || bin_filename[0] == '\0') {
    bin_filename_str.append(".bin");
    bin_filename = bin_filename_str.c_str();
  }
Guolin Ke's avatar
Guolin Ke committed
926
  bool is_file_existed = false;
927
928

  if (VirtualFileWriter::Exists(bin_filename)) {
Guolin Ke's avatar
Guolin Ke committed
929
    is_file_existed = true;
930
    Log::Warning("File %s exists, cannot save binary to it", bin_filename);
Guolin Ke's avatar
Guolin Ke committed
931
  }
Guolin Ke's avatar
Guolin Ke committed
932

Guolin Ke's avatar
Guolin Ke committed
933
  if (!is_file_existed) {
934
935
    auto writer = VirtualFileWriter::Make(bin_filename);
    if (!writer->Init()) {
Guolin Ke's avatar
Guolin Ke committed
936
      Log::Fatal("Cannot write binary data to %s ", bin_filename);
Guolin Ke's avatar
Guolin Ke committed
937
    }
938
    Log::Info("Saving data to binary file %s", bin_filename);
939
    size_t size_of_token = std::strlen(binary_file_token);
940
    writer->AlignedWrite(binary_file_token, size_of_token);
Guolin Ke's avatar
Guolin Ke committed
941
    // get size of header
942
943
944
945
946
947
948
949
950
951
952
953
    size_t size_of_header =
        VirtualFileWriter::AlignedSize(sizeof(num_data_)) +
        VirtualFileWriter::AlignedSize(sizeof(num_features_)) +
        VirtualFileWriter::AlignedSize(sizeof(num_total_features_)) +
        VirtualFileWriter::AlignedSize(sizeof(int) * num_total_features_) +
        VirtualFileWriter::AlignedSize(sizeof(label_idx_)) +
        VirtualFileWriter::AlignedSize(sizeof(num_groups_)) +
        3 * VirtualFileWriter::AlignedSize(sizeof(int) * num_features_) +
        sizeof(uint64_t) * (num_groups_ + 1) +
        2 * VirtualFileWriter::AlignedSize(sizeof(int) * num_groups_) +
        VirtualFileWriter::AlignedSize(sizeof(int32_t) * num_total_features_) +
        VirtualFileWriter::AlignedSize(sizeof(int)) * 3 +
954
        VirtualFileWriter::AlignedSize(sizeof(bool)) * 3;
955
956
    // size of feature names
    for (int i = 0; i < num_total_features_; ++i) {
957
958
959
      size_of_header +=
          VirtualFileWriter::AlignedSize(feature_names_[i].size()) +
          VirtualFileWriter::AlignedSize(sizeof(int));
960
    }
961
962
    // size of forced bins
    for (int i = 0; i < num_total_features_; ++i) {
963
964
      size_of_header += forced_bin_bounds_[i].size() * sizeof(double) +
                        VirtualFileWriter::AlignedSize(sizeof(int));
965
    }
966
    writer->Write(&size_of_header, sizeof(size_of_header));
Guolin Ke's avatar
Guolin Ke committed
967
    // write header
968
969
970
971
972
973
974
975
976
977
    writer->AlignedWrite(&num_data_, sizeof(num_data_));
    writer->AlignedWrite(&num_features_, sizeof(num_features_));
    writer->AlignedWrite(&num_total_features_, sizeof(num_total_features_));
    writer->AlignedWrite(&label_idx_, sizeof(label_idx_));
    writer->AlignedWrite(&max_bin_, sizeof(max_bin_));
    writer->AlignedWrite(&bin_construct_sample_cnt_,
                         sizeof(bin_construct_sample_cnt_));
    writer->AlignedWrite(&min_data_in_bin_, sizeof(min_data_in_bin_));
    writer->AlignedWrite(&use_missing_, sizeof(use_missing_));
    writer->AlignedWrite(&zero_as_missing_, sizeof(zero_as_missing_));
978
    writer->AlignedWrite(&has_raw_, sizeof(has_raw_));
979
980
981
982
983
984
985
    writer->AlignedWrite(used_feature_map_.data(),
                         sizeof(int) * num_total_features_);
    writer->AlignedWrite(&num_groups_, sizeof(num_groups_));
    writer->AlignedWrite(real_feature_idx_.data(), sizeof(int) * num_features_);
    writer->AlignedWrite(feature2group_.data(), sizeof(int) * num_features_);
    writer->AlignedWrite(feature2subfeature_.data(),
                         sizeof(int) * num_features_);
Guolin Ke's avatar
Guolin Ke committed
986
987
    writer->Write(group_bin_boundaries_.data(),
                  sizeof(uint64_t) * (num_groups_ + 1));
988
989
990
    writer->AlignedWrite(group_feature_start_.data(),
                         sizeof(int) * num_groups_);
    writer->AlignedWrite(group_feature_cnt_.data(), sizeof(int) * num_groups_);
Belinda Trotta's avatar
Belinda Trotta committed
991
992
993
    if (max_bin_by_feature_.empty()) {
      ArrayArgs<int32_t>::Assign(&max_bin_by_feature_, -1, num_total_features_);
    }
994
    writer->AlignedWrite(max_bin_by_feature_.data(),
Guolin Ke's avatar
Guolin Ke committed
995
                  sizeof(int32_t) * num_total_features_);
Belinda Trotta's avatar
Belinda Trotta committed
996
997
998
    if (ArrayArgs<int32_t>::CheckAll(max_bin_by_feature_, -1)) {
      max_bin_by_feature_.clear();
    }
999
1000
1001
    // write feature names
    for (int i = 0; i < num_total_features_; ++i) {
      int str_len = static_cast<int>(feature_names_[i].size());
1002
      writer->AlignedWrite(&str_len, sizeof(int));
1003
      const char* c_str = feature_names_[i].c_str();
1004
      writer->AlignedWrite(c_str, sizeof(char) * str_len);
1005
    }
1006
1007
1008
    // write forced bins
    for (int i = 0; i < num_total_features_; ++i) {
      int num_bounds = static_cast<int>(forced_bin_bounds_[i].size());
1009
      writer->AlignedWrite(&num_bounds, sizeof(int));
1010

1011
1012
1013
1014
      for (size_t j = 0; j < forced_bin_bounds_[i].size(); ++j) {
        writer->Write(&forced_bin_bounds_[i][j], sizeof(double));
      }
    }
1015

Guolin Ke's avatar
Guolin Ke committed
1016
1017
    // get size of meta data
    size_t size_of_metadata = metadata_.SizesInByte();
1018
    writer->Write(&size_of_metadata, sizeof(size_of_metadata));
Guolin Ke's avatar
Guolin Ke committed
1019
    // write meta data
1020
    metadata_.SaveBinaryToFile(writer.get());
Guolin Ke's avatar
Guolin Ke committed
1021
1022

    // write feature data
Guolin Ke's avatar
Guolin Ke committed
1023
    for (int i = 0; i < num_groups_; ++i) {
Guolin Ke's avatar
Guolin Ke committed
1024
      // get size of feature
Guolin Ke's avatar
Guolin Ke committed
1025
      size_t size_of_feature = feature_groups_[i]->SizesInByte();
1026
      writer->Write(&size_of_feature, sizeof(size_of_feature));
Guolin Ke's avatar
Guolin Ke committed
1027
      // write feature
1028
      feature_groups_[i]->SaveBinaryToFile(writer.get());
Guolin Ke's avatar
Guolin Ke committed
1029
    }
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041

    // write raw data; use row-major order so we can read row-by-row
    if (has_raw_) {
      for (int i = 0; i < num_data_; ++i) {
        for (int j = 0; j < num_features_; ++j) {
          int feat_ind = numeric_feature_map_[j];
          if (feat_ind > -1) {
            writer->Write(&raw_data_[feat_ind][i], sizeof(float));
          }
        }
      }
    }
Guolin Ke's avatar
Guolin Ke committed
1042
1043
1044
  }
}

1045
void Dataset::DumpTextFile(const char* text_filename) {
Guolin Ke's avatar
Guolin Ke committed
1046
1047
1048
1049
1050
1051
  FILE* file = NULL;
#if _MSC_VER
  fopen_s(&file, text_filename, "wt");
#else
  file = fopen(text_filename, "wt");
#endif
1052
1053
1054
1055
1056
  fprintf(file, "num_features: %d\n", num_features_);
  fprintf(file, "num_total_features: %d\n", num_total_features_);
  fprintf(file, "num_groups: %d\n", num_groups_);
  fprintf(file, "num_data: %d\n", num_data_);
  fprintf(file, "feature_names: ");
1057
  for (auto n : feature_names_) {
1058
1059
    fprintf(file, "%s, ", n.c_str());
  }
Belinda Trotta's avatar
Belinda Trotta committed
1060
1061
1062
1063
  fprintf(file, "\nmax_bin_by_feature: ");
  for (auto i : max_bin_by_feature_) {
    fprintf(file, "%d, ", i);
  }
1064
  fprintf(file, "\n");
1065
  for (auto n : feature_names_) {
1066
1067
    fprintf(file, "%s, ", n.c_str());
  }
1068
1069
1070
1071
1072
1073
1074
  fprintf(file, "\nforced_bins: ");
  for (int i = 0; i < num_total_features_; ++i) {
    fprintf(file, "\nfeature %d: ", i);
    for (size_t j = 0; j < forced_bin_bounds_[i].size(); ++j) {
      fprintf(file, "%lf, ", forced_bin_bounds_[i][j]);
    }
  }
1075
1076
  std::vector<std::unique_ptr<BinIterator>> iterators;
  iterators.reserve(num_features_);
1077
  for (int j = 0; j < num_features_; ++j) {
1078
1079
    auto group_idx = feature2group_[j];
    auto sub_idx = feature2subfeature_[j];
Guolin Ke's avatar
Guolin Ke committed
1080
1081
    iterators.emplace_back(
        feature_groups_[group_idx]->SubFeatureIterator(sub_idx));
1082
  }
1083
  for (data_size_t i = 0; i < num_data_; ++i) {
1084
    fprintf(file, "\n");
1085
    for (int j = 0; j < num_total_features_; ++j) {
1086
      auto inner_feature_idx = used_feature_map_[j];
1087
1088
      if (inner_feature_idx < 0) {
        fprintf(file, "NA, ");
1089
      } else {
Guolin Ke's avatar
Guolin Ke committed
1090
        fprintf(file, "%d, ", iterators[inner_feature_idx]->Get(i));
1091
1092
1093
1094
1095
1096
      }
    }
  }
  fclose(file);
}

1097
void Dataset::InitTrain(const std::vector<int8_t>& is_feature_used,
1098
                        TrainingShareStates* share_state) const {
1099
  Common::FunctionTimer fun_time("Dataset::InitTrain", global_timer);
1100
1101
1102
  share_state->InitTrain(group_feature_start_,
        feature_groups_,
        is_feature_used);
1103
1104
}

Guolin Ke's avatar
Guolin Ke committed
1105
template <bool USE_INDICES, bool ORDERED>
1106
1107
1108
1109
1110
1111
void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices,
                                          data_size_t num_data,
                                          const score_t* gradients,
                                          const score_t* hessians,
                                          TrainingShareStates* share_state,
                                          hist_t* hist_data) const {
1112
1113
  Common::FunctionTimer fun_time("Dataset::ConstructHistogramsMultiVal",
                                 global_timer);
1114
1115
  share_state->ConstructHistograms<USE_INDICES, ORDERED>(
      data_indices, num_data, gradients, hessians, hist_data);
1116
1117
}

Guolin Ke's avatar
Guolin Ke committed
1118
1119
template <bool USE_INDICES, bool USE_HESSIAN>
void Dataset::ConstructHistogramsInner(
1120
1121
1122
    const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
    data_size_t num_data, const score_t* gradients, const score_t* hessians,
    score_t* ordered_gradients, score_t* ordered_hessians,
1123
    TrainingShareStates* share_state, hist_t* hist_data) const {
1124
  if (!share_state->is_col_wise) {
Guolin Ke's avatar
Guolin Ke committed
1125
1126
    return ConstructHistogramsMultiVal<USE_INDICES, false>(
        data_indices, num_data, gradients, hessians, share_state, hist_data);
1127
1128
1129
1130
  }
  std::vector<int> used_dense_group;
  int multi_val_groud_id = -1;
  used_dense_group.reserve(num_groups_);
Guolin Ke's avatar
Guolin Ke committed
1131
  for (int group = 0; group < num_groups_; ++group) {
Guolin Ke's avatar
Guolin Ke committed
1132
    const int f_start = group_feature_start_[group];
Guolin Ke's avatar
Guolin Ke committed
1133
    const int f_cnt = group_feature_cnt_[group];
1134
    bool is_group_used = false;
Guolin Ke's avatar
Guolin Ke committed
1135
    for (int j = 0; j < f_cnt; ++j) {
Guolin Ke's avatar
Guolin Ke committed
1136
      const int fidx = f_start + j;
Guolin Ke's avatar
Guolin Ke committed
1137
      if (is_feature_used[fidx]) {
1138
        is_group_used = true;
Guolin Ke's avatar
Guolin Ke committed
1139
1140
1141
        break;
      }
    }
1142
    if (is_group_used) {
1143
1144
1145
1146
      if (feature_groups_[group]->is_multi_val_) {
        multi_val_groud_id = group;
      } else {
        used_dense_group.push_back(group);
1147
      }
Guolin Ke's avatar
Guolin Ke committed
1148
    }
1149
1150
1151
  }
  int num_used_dense_group = static_cast<int>(used_dense_group.size());
  global_timer.Start("Dataset::dense_bin_histogram");
Guolin Ke's avatar
Guolin Ke committed
1152
1153
  auto ptr_ordered_grad = gradients;
  auto ptr_ordered_hess = hessians;
1154
  if (num_used_dense_group > 0) {
Guolin Ke's avatar
Guolin Ke committed
1155
1156
    if (USE_INDICES) {
      if (USE_HESSIAN) {
Guolin Ke's avatar
Guolin Ke committed
1157
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024)
1158
1159
1160
1161
        for (data_size_t i = 0; i < num_data; ++i) {
          ordered_gradients[i] = gradients[data_indices[i]];
          ordered_hessians[i] = hessians[data_indices[i]];
        }
Guolin Ke's avatar
Guolin Ke committed
1162
1163
        ptr_ordered_grad = ordered_gradients;
        ptr_ordered_hess = ordered_hessians;
1164
      } else {
Guolin Ke's avatar
Guolin Ke committed
1165
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024)
1166
1167
        for (data_size_t i = 0; i < num_data; ++i) {
          ordered_gradients[i] = gradients[data_indices[i]];
1168
        }
Guolin Ke's avatar
Guolin Ke committed
1169
        ptr_ordered_grad = ordered_gradients;
1170
      }
Guolin Ke's avatar
Guolin Ke committed
1171
1172
    }
    OMP_INIT_EX();
1173
#pragma omp parallel for schedule(static) num_threads(share_state->num_threads)
Guolin Ke's avatar
Guolin Ke committed
1174
1175
1176
1177
1178
1179
1180
1181
1182
    for (int gi = 0; gi < num_used_dense_group; ++gi) {
      OMP_LOOP_EX_BEGIN();
      int group = used_dense_group[gi];
      auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
      const int num_bin = feature_groups_[group]->num_total_bin_;
      std::memset(reinterpret_cast<void*>(data_ptr), 0,
                  num_bin * kHistEntrySize);
      if (USE_HESSIAN) {
        if (USE_INDICES) {
1183
          feature_groups_[group]->bin_data_->ConstructHistogram(
1184
1185
              data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess,
              data_ptr);
Guolin Ke's avatar
Guolin Ke committed
1186
        } else {
1187
          feature_groups_[group]->bin_data_->ConstructHistogram(
1188
              0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr);
1189
        }
1190
      } else {
Guolin Ke's avatar
Guolin Ke committed
1191
1192
1193
1194
        if (USE_INDICES) {
          feature_groups_[group]->bin_data_->ConstructHistogram(
              data_indices, 0, num_data, ptr_ordered_grad, data_ptr);
        } else {
1195
1196
          feature_groups_[group]->bin_data_->ConstructHistogram(
              0, num_data, ptr_ordered_grad, data_ptr);
1197
        }
Guolin Ke's avatar
Guolin Ke committed
1198
1199
1200
1201
        auto cnt_dst = reinterpret_cast<hist_cnt_t*>(data_ptr + 1);
        for (int i = 0; i < num_bin * 2; i += 2) {
          data_ptr[i + 1] = static_cast<double>(cnt_dst[i]) * hessians[0];
        }
1202
      }
Guolin Ke's avatar
Guolin Ke committed
1203
      OMP_LOOP_EX_END();
Guolin Ke's avatar
Guolin Ke committed
1204
    }
Guolin Ke's avatar
Guolin Ke committed
1205
    OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
1206
  }
1207
1208
  global_timer.Stop("Dataset::dense_bin_histogram");
  if (multi_val_groud_id >= 0) {
Guolin Ke's avatar
Guolin Ke committed
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
    if (num_used_dense_group > 0) {
      ConstructHistogramsMultiVal<USE_INDICES, true>(
          data_indices, num_data, ptr_ordered_grad, ptr_ordered_hess,
          share_state,
          hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
    } else {
      ConstructHistogramsMultiVal<USE_INDICES, false>(
          data_indices, num_data, gradients, hessians, share_state,
          hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
    }
1219
  }
Guolin Ke's avatar
Guolin Ke committed
1220
1221
}

Guolin Ke's avatar
Guolin Ke committed
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
// explicitly initilize template methods, for cross module call
template void Dataset::ConstructHistogramsInner<true, true>(
    const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
    data_size_t num_data, const score_t* gradients, const score_t* hessians,
    score_t* ordered_gradients, score_t* ordered_hessians,
    TrainingShareStates* share_state, hist_t* hist_data) const;

template void Dataset::ConstructHistogramsInner<true, false>(
    const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
    data_size_t num_data, const score_t* gradients, const score_t* hessians,
    score_t* ordered_gradients, score_t* ordered_hessians,
    TrainingShareStates* share_state, hist_t* hist_data) const;

template void Dataset::ConstructHistogramsInner<false, true>(
    const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
    data_size_t num_data, const score_t* gradients, const score_t* hessians,
    score_t* ordered_gradients, score_t* ordered_hessians,
    TrainingShareStates* share_state, hist_t* hist_data) const;

template void Dataset::ConstructHistogramsInner<false, false>(
    const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
    data_size_t num_data, const score_t* gradients, const score_t* hessians,
    score_t* ordered_gradients, score_t* ordered_hessians,
    TrainingShareStates* share_state, hist_t* hist_data) const;

Guolin Ke's avatar
Guolin Ke committed
1247
1248
void Dataset::FixHistogram(int feature_idx, double sum_gradient,
                           double sum_hessian, hist_t* data) const {
Guolin Ke's avatar
Guolin Ke committed
1249
1250
  const int group = feature2group_[feature_idx];
  const int sub_feature = feature2subfeature_[feature_idx];
Guolin Ke's avatar
Guolin Ke committed
1251
1252
  const BinMapper* bin_mapper =
      feature_groups_[group]->bin_mappers_[sub_feature].get();
Guolin Ke's avatar
Guolin Ke committed
1253
1254
  const int most_freq_bin = bin_mapper->GetMostFreqBin();
  if (most_freq_bin > 0) {
Guolin Ke's avatar
Guolin Ke committed
1255
    const int num_bin = bin_mapper->num_bin();
1256
1257
    GET_GRAD(data, most_freq_bin) = sum_gradient;
    GET_HESS(data, most_freq_bin) = sum_hessian;
Guolin Ke's avatar
Guolin Ke committed
1258
    for (int i = 0; i < num_bin; ++i) {
Guolin Ke's avatar
Guolin Ke committed
1259
      if (i != most_freq_bin) {
1260
1261
        GET_GRAD(data, most_freq_bin) -= GET_GRAD(data, i);
        GET_HESS(data, most_freq_bin) -= GET_HESS(data, i);
Guolin Ke's avatar
Guolin Ke committed
1262
1263
1264
1265
1266
      }
    }
  }
}

Guolin Ke's avatar
Guolin Ke committed
1267
template <typename T>
Guolin Ke's avatar
Guolin Ke committed
1268
1269
void PushVector(std::vector<T>* dest, const std::vector<T>& src) {
  dest->reserve(dest->size() + src.size());
1270
  for (auto i : src) {
Guolin Ke's avatar
Guolin Ke committed
1271
    dest->push_back(i);
1272
1273
1274
  }
}

Guolin Ke's avatar
Guolin Ke committed
1275
1276
1277
template <typename T>
void PushOffset(std::vector<T>* dest, const std::vector<T>& src,
                const T& offset) {
Guolin Ke's avatar
Guolin Ke committed
1278
  dest->reserve(dest->size() + src.size());
1279
  for (auto i : src) {
Guolin Ke's avatar
Guolin Ke committed
1280
    dest->push_back(i + offset);
1281
1282
1283
  }
}

Guolin Ke's avatar
Guolin Ke committed
1284
1285
1286
1287
template <typename T>
void PushClearIfEmpty(std::vector<T>* dest, const size_t dest_len,
                      const std::vector<T>& src, const size_t src_len,
                      const T& deflt) {
Guolin Ke's avatar
Guolin Ke committed
1288
  if (!dest->empty() && !src.empty()) {
1289
    PushVector(dest, src);
Guolin Ke's avatar
Guolin Ke committed
1290
  } else if (!dest->empty() && src.empty()) {
1291
    for (size_t i = 0; i < src_len; ++i) {
Guolin Ke's avatar
Guolin Ke committed
1292
      dest->push_back(deflt);
1293
    }
Guolin Ke's avatar
Guolin Ke committed
1294
  } else if (dest->empty() && !src.empty()) {
1295
    for (size_t i = 0; i < dest_len; ++i) {
Guolin Ke's avatar
Guolin Ke committed
1296
      dest->push_back(deflt);
1297
1298
1299
1300
1301
    }
    PushVector(dest, src);
  }
}

1302
void Dataset::AddFeaturesFrom(Dataset* other) {
1303
  if (other->num_data_ != num_data_) {
1304
    Log::Fatal(
Guolin Ke's avatar
Guolin Ke committed
1305
1306
        "Cannot add features from other Dataset with a different number of "
        "rows");
1307
  }
1308
1309
1310
  if (other->has_raw_ != has_raw_) {
    Log::Fatal("Can only add features from other Dataset if both or neither have raw data.");
  }
Guolin Ke's avatar
Guolin Ke committed
1311
1312
1313
1314
1315
  int mv_gid = -1;
  int other_mv_gid = -1;
  for (int i = 0; i < num_groups_; ++i) {
    if (IsMultiGroup(i)) {
      mv_gid = i;
1316
1317
    }
  }
Guolin Ke's avatar
Guolin Ke committed
1318
1319
1320
1321
  for (int i = 0; i < other->num_groups_; ++i) {
    if (other->IsMultiGroup(i)) {
      other_mv_gid = i;
    }
1322
  }
Guolin Ke's avatar
Guolin Ke committed
1323
1324
1325
1326
1327
1328
  // Only one multi-val group, just simply merge
  if (mv_gid < 0 || other_mv_gid < 0) {
    PushVector(&feature2subfeature_, other->feature2subfeature_);
    PushVector(&group_feature_cnt_, other->group_feature_cnt_);
    feature_groups_.reserve(other->feature_groups_.size());
    for (auto& fg : other->feature_groups_) {
1329
1330
      const int cur_group_id = static_cast<int>(feature_groups_.size());
      feature_groups_.emplace_back(new FeatureGroup(*fg, true, cur_group_id));
Guolin Ke's avatar
Guolin Ke committed
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
    }
    for (auto feature_idx : other->used_feature_map_) {
      if (feature_idx >= 0) {
        used_feature_map_.push_back(feature_idx + num_features_);
      } else {
        used_feature_map_.push_back(-1);  // Unused feature.
      }
    }
    PushOffset(&real_feature_idx_, other->real_feature_idx_,
               num_total_features_);
    PushOffset(&feature2group_, other->feature2group_, num_groups_);
    auto bin_offset = group_bin_boundaries_.back();
    // Skip the leading 0 when copying group_bin_boundaries.
    for (auto i = other->group_bin_boundaries_.begin() + 1;
         i < other->group_bin_boundaries_.end(); ++i) {
      group_bin_boundaries_.push_back(*i + bin_offset);
    }
    PushOffset(&group_feature_start_, other->group_feature_start_,
               num_features_);
    num_groups_ += other->num_groups_;
    num_features_ += other->num_features_;
  } else {
    std::vector<std::vector<int>> features_in_group;
    for (int i = 0; i < num_groups_; ++i) {
      int f_start = group_feature_start_[i];
      int f_cnt = group_feature_cnt_[i];
      features_in_group.emplace_back();
      for (int j = 0; j < f_cnt; ++j) {
1359
1360
        const int real_fidx = real_feature_idx_[f_start + j];
        features_in_group.back().push_back(real_fidx);
Guolin Ke's avatar
Guolin Ke committed
1361
1362
1363
      }
    }
    feature_groups_[mv_gid]->AddFeaturesFrom(
1364
        other->feature_groups_[other_mv_gid].get(), mv_gid);
Guolin Ke's avatar
Guolin Ke committed
1365
1366
1367
1368
1369
    for (int i = 0; i < other->num_groups_; ++i) {
      int f_start = other->group_feature_start_[i];
      int f_cnt = other->group_feature_cnt_[i];
      if (i == other_mv_gid) {
        for (int j = 0; j < f_cnt; ++j) {
1370
1371
          const int real_fidx = other->real_feature_idx_[f_start + j] + num_total_features_;
          features_in_group[mv_gid].push_back(real_fidx);
Guolin Ke's avatar
Guolin Ke committed
1372
1373
1374
1375
        }
      } else {
        features_in_group.emplace_back();
        for (int j = 0; j < f_cnt; ++j) {
1376
1377
          const int real_fidx = other->real_feature_idx_[f_start + j] + num_total_features_;
          features_in_group.back().push_back(real_fidx);
Guolin Ke's avatar
Guolin Ke committed
1378
1379
        }
        feature_groups_.emplace_back(
1380
            new FeatureGroup(*other->feature_groups_[i], false, -1));
Guolin Ke's avatar
Guolin Ke committed
1381
1382
1383
1384
1385
1386
1387
      }
    }
    // regenerate other fields
    num_groups_ += other->num_groups_ - 1;
    CHECK(num_groups_ == static_cast<int>(features_in_group.size()));
    num_features_ += other->num_features_;
    int cur_fidx = 0;
1388
1389
    used_feature_map_ =
      std::vector<int>(num_total_features_ + other->num_total_features_, -1);
Guolin Ke's avatar
Guolin Ke committed
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
    real_feature_idx_.resize(num_features_);
    feature2group_.resize(num_features_);
    feature2subfeature_.resize(num_features_);
    group_feature_start_.resize(num_groups_);
    group_feature_cnt_.resize(num_groups_);

    group_bin_boundaries_.clear();
    uint64_t num_total_bin = 0;
    group_bin_boundaries_.push_back(num_total_bin);
    for (int i = 0; i < num_groups_; ++i) {
      auto cur_features = features_in_group[i];
      int cur_cnt_features = static_cast<int>(cur_features.size());
      group_feature_start_[i] = cur_fidx;
      group_feature_cnt_[i] = cur_cnt_features;
      for (int j = 0; j < cur_cnt_features; ++j) {
        int real_fidx = cur_features[j];
        used_feature_map_[real_fidx] = cur_fidx;
        real_feature_idx_[cur_fidx] = real_fidx;
        feature2group_[cur_fidx] = i;
        feature2subfeature_[cur_fidx] = j;
        ++cur_fidx;
      }
      num_total_bin += feature_groups_[i]->num_total_bin_;
      group_bin_boundaries_.push_back(num_total_bin);
    }
  }
  std::unordered_set<std::string> feature_names_set;
  for (const auto& val : feature_names_) {
    feature_names_set.emplace(val);
  }
  for (const auto& val : other->feature_names_) {
    std::string new_name = val;
    int cnt = 2;
    while (feature_names_set.count(new_name)) {
      new_name = "D" + std::to_string(cnt) + "_" + val;
      ++cnt;
    }
    if (new_name != val) {
      Log::Warning(
        "Find the same feature name (%s) in Dataset::AddFeaturesFrom, change "
        "its name to (%s)",
        val.c_str(), new_name.c_str());
    }
    feature_names_set.emplace(new_name);
    feature_names_.push_back(new_name);
  }
  PushVector(&forced_bin_bounds_, other->forced_bin_bounds_);
  PushClearIfEmpty(&max_bin_by_feature_, num_total_features_,
                   other->max_bin_by_feature_, other->num_total_features_, -1);
1439
  num_total_features_ += other->num_total_features_;
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
  for (size_t i = 0; i < (other->numeric_feature_map_).size(); ++i) {
    int feat_ind = numeric_feature_map_[i];
    if (feat_ind > -1) {
      numeric_feature_map_.push_back(feat_ind + num_numeric_features_);
    } else {
      numeric_feature_map_.push_back(-1);
    }
  }
  num_numeric_features_ += other->num_numeric_features_;
  if (has_raw_) {
    for (int i = 0; i < other->num_numeric_features_; ++i) {
      raw_data_.push_back(other->raw_data_[i]);
    }
  }
1454
1455
}

Guolin Ke's avatar
Guolin Ke committed
1456
}  // namespace LightGBM