dataset.cpp 40.5 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
Guolin Ke's avatar
Guolin Ke committed
5
#include <LightGBM/dataset.h>
6

Guolin Ke's avatar
Guolin Ke committed
7
#include <LightGBM/feature_group.h>
8
#include <LightGBM/utils/array_args.h>
9
#include <LightGBM/utils/openmp_wrapper.h>
Guolin Ke's avatar
Guolin Ke committed
10
#include <LightGBM/utils/threading.h>
Guolin Ke's avatar
Guolin Ke committed
11

12
#include <limits>
zhangyafeikimi's avatar
zhangyafeikimi committed
13
#include <chrono>
Guolin Ke's avatar
Guolin Ke committed
14
#include <cstdio>
Guolin Ke's avatar
Guolin Ke committed
15
#include <sstream>
16
#include <unordered_map>
Guolin Ke's avatar
Guolin Ke committed
17

18

Guolin Ke's avatar
Guolin Ke committed
19
20
namespace LightGBM {

21
const char* Dataset::binary_file_token = "______LightGBM_Binary_File_Token______\n";
Guolin Ke's avatar
Guolin Ke committed
22

Guolin Ke's avatar
Guolin Ke committed
23
Dataset::Dataset() {
24
  data_filename_ = "noname";
Guolin Ke's avatar
Guolin Ke committed
25
  num_data_ = 0;
Guolin Ke's avatar
Guolin Ke committed
26
  is_finish_load_ = false;
Guolin Ke's avatar
Guolin Ke committed
27
28
}

29
Dataset::Dataset(data_size_t num_data) {
Guolin Ke's avatar
Guolin Ke committed
30
  CHECK(num_data > 0);
Guolin Ke's avatar
Guolin Ke committed
31
  data_filename_ = "noname";
Guolin Ke's avatar
Guolin Ke committed
32
  num_data_ = num_data;
Guolin Ke's avatar
Guolin Ke committed
33
  metadata_.Init(num_data_, NO_SPECIFIC, NO_SPECIFIC);
Guolin Ke's avatar
Guolin Ke committed
34
  is_finish_load_ = false;
Guolin Ke's avatar
Guolin Ke committed
35
  group_bin_boundaries_.push_back(0);
Guolin Ke's avatar
Guolin Ke committed
36
37
}

Guolin Ke's avatar
Guolin Ke committed
38
Dataset::~Dataset() {
Guolin Ke's avatar
Guolin Ke committed
39
}
Guolin Ke's avatar
Guolin Ke committed
40

Guolin Ke's avatar
Guolin Ke committed
41
42
43
44
45
46
47
48
49
50
std::vector<std::vector<int>> NoGroup(
  const std::vector<int>& used_features) {
  std::vector<std::vector<int>> features_in_group;
  features_in_group.resize(used_features.size());
  for (size_t i = 0; i < used_features.size(); ++i) {
    features_in_group[i].emplace_back(used_features[i]);
  }
  return features_in_group;
}

Guolin Ke's avatar
Guolin Ke committed
51
52
53
54
55
56
57
58
59
60
61
62
int GetConfilctCount(const std::vector<bool>& mark, const int* indices, int num_indices, int max_cnt) {
  int ret = 0;
  for (int i = 0; i < num_indices; ++i) {
    if (mark[indices[i]]) {
      ++ret;
      if (ret > max_cnt) {
        return -1;
      }
    }
  }
  return ret;
}
Guolin Ke's avatar
Guolin Ke committed
63
void MarkUsed(std::vector<bool>* mark, const int* indices, int num_indices) {
Guolin Ke's avatar
Guolin Ke committed
64
  for (int i = 0; i < num_indices; ++i) {
Guolin Ke's avatar
Guolin Ke committed
65
    mark->at(indices[i]) = true;
Guolin Ke's avatar
Guolin Ke committed
66
67
68
69
70
71
72
73
74
75
  }
}

std::vector<std::vector<int>> FindGroups(const std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
                                         const std::vector<int>& find_order,
                                         int** sample_indices,
                                         const int* num_per_col,
                                         size_t total_sample_cnt,
                                         data_size_t max_error_cnt,
                                         data_size_t filter_cnt,
Guolin Ke's avatar
Guolin Ke committed
76
77
                                         data_size_t num_data,
                                         bool is_use_gpu) {
Guolin Ke's avatar
Guolin Ke committed
78
  const int max_search_group = 100;
Guolin Ke's avatar
Guolin Ke committed
79
  const int gpu_max_bin_per_group = 256;
Guolin Ke's avatar
Guolin Ke committed
80
81
82
83
84
85
86
87
88
89
90
91
  Random rand(num_data);
  std::vector<std::vector<int>> features_in_group;
  std::vector<std::vector<bool>> conflict_marks;
  std::vector<int> group_conflict_cnt;
  std::vector<size_t> group_non_zero_cnt;
  std::vector<int> group_num_bin;

  for (auto fidx : find_order) {
    const size_t cur_non_zero_cnt = num_per_col[fidx];
    bool need_new_group = true;
    std::vector<int> available_groups;
    for (int gid = 0; gid < static_cast<int>(features_in_group.size()); ++gid) {
92
      if (group_non_zero_cnt[gid] + cur_non_zero_cnt <= total_sample_cnt + max_error_cnt) {
Guolin Ke's avatar
Guolin Ke committed
93
94
95
96
        if (!is_use_gpu || group_num_bin[gid] + bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0)
            <= gpu_max_bin_per_group) {
          available_groups.push_back(gid);
        }
Guolin Ke's avatar
Guolin Ke committed
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
      }
    }
    std::vector<int> search_groups;
    if (!available_groups.empty()) {
      int last = static_cast<int>(available_groups.size()) - 1;
      auto indices = rand.Sample(last, std::min(last, max_search_group - 1));
      search_groups.push_back(available_groups.back());
      for (auto idx : indices) {
        search_groups.push_back(available_groups[idx]);
      }
    }
    for (auto gid : search_groups) {
      const int rest_max_cnt = max_error_cnt - group_conflict_cnt[gid];
      int cnt = GetConfilctCount(conflict_marks[gid], sample_indices[fidx], num_per_col[fidx], rest_max_cnt);
      if (cnt >= 0 && cnt <= rest_max_cnt) {
        data_size_t rest_non_zero_data = static_cast<data_size_t>(
          static_cast<double>(cur_non_zero_cnt - cnt) * num_data / total_sample_cnt);
        if (rest_non_zero_data < filter_cnt) { continue; }
        need_new_group = false;
        features_in_group[gid].push_back(fidx);
        group_conflict_cnt[gid] += cnt;
        group_non_zero_cnt[gid] += cur_non_zero_cnt - cnt;
Guolin Ke's avatar
Guolin Ke committed
119
        MarkUsed(&conflict_marks[gid], sample_indices[fidx], num_per_col[fidx]);
Guolin Ke's avatar
Guolin Ke committed
120
121
122
        if (is_use_gpu) {
          group_num_bin[gid] += bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0);
        }
Guolin Ke's avatar
Guolin Ke committed
123
124
125
126
127
128
129
130
        break;
      }
    }
    if (need_new_group) {
      features_in_group.emplace_back();
      features_in_group.back().push_back(fidx);
      group_conflict_cnt.push_back(0);
      conflict_marks.emplace_back(total_sample_cnt, false);
Guolin Ke's avatar
Guolin Ke committed
131
      MarkUsed(&(conflict_marks.back()), sample_indices[fidx], num_per_col[fidx]);
Guolin Ke's avatar
Guolin Ke committed
132
      group_non_zero_cnt.emplace_back(cur_non_zero_cnt);
Guolin Ke's avatar
Guolin Ke committed
133
134
135
      if (is_use_gpu) {
        group_num_bin.push_back(1 + bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0));
      }
Guolin Ke's avatar
Guolin Ke committed
136
137
138
139
140
    }
  }
  return features_in_group;
}

Guolin Ke's avatar
Guolin Ke committed
141
std::vector<std::vector<int>> FastFeatureBundling(const std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
Guolin Ke's avatar
Guolin Ke committed
142
143
144
145
146
147
148
149
                                                  int** sample_indices,
                                                  const int* num_per_col,
                                                  size_t total_sample_cnt,
                                                  const std::vector<int>& used_features,
                                                  double max_conflict_rate,
                                                  data_size_t num_data,
                                                  data_size_t min_data,
                                                  double sparse_threshold,
Guolin Ke's avatar
Guolin Ke committed
150
151
                                                  bool is_enable_sparse,
                                                  bool is_use_gpu) {
Guolin Ke's avatar
Guolin Ke committed
152
153
154
155
  // filter is based on sampling data, so decrease its range
  const data_size_t filter_cnt = static_cast<data_size_t>(static_cast<double>(0.95 * min_data) / num_data * total_sample_cnt);
  const data_size_t max_error_cnt = static_cast<data_size_t>(total_sample_cnt * max_conflict_rate);
  std::vector<size_t> feature_non_zero_cnt;
156
  feature_non_zero_cnt.reserve(used_features.size());
Guolin Ke's avatar
Guolin Ke committed
157
158
159
160
161
162
  // put dense feature first
  for (auto fidx : used_features) {
    feature_non_zero_cnt.emplace_back(num_per_col[fidx]);
  }
  // sort by non zero cnt
  std::vector<int> sorted_idx;
163
  sorted_idx.reserve(used_features.size());
164
  for (int i = 0; i < static_cast<int>(used_features.size()); ++i) {
Guolin Ke's avatar
Guolin Ke committed
165
166
167
    sorted_idx.emplace_back(i);
  }
  // sort by non zero cnt, bigger first
168
169
  std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
                   [&feature_non_zero_cnt](int a, int b) {
Guolin Ke's avatar
Guolin Ke committed
170
171
172
173
    return feature_non_zero_cnt[a] > feature_non_zero_cnt[b];
  });

  std::vector<int> feature_order_by_cnt;
174
  feature_order_by_cnt.reserve(sorted_idx.size());
Guolin Ke's avatar
Guolin Ke committed
175
176
177
  for (auto sidx : sorted_idx) {
    feature_order_by_cnt.push_back(used_features[sidx]);
  }
Guolin Ke's avatar
Guolin Ke committed
178
179
  auto features_in_group = FindGroups(bin_mappers, used_features, sample_indices, num_per_col, total_sample_cnt, max_error_cnt, filter_cnt, num_data, is_use_gpu);
  auto group2 = FindGroups(bin_mappers, feature_order_by_cnt, sample_indices, num_per_col, total_sample_cnt, max_error_cnt, filter_cnt, num_data, is_use_gpu);
Guolin Ke's avatar
Guolin Ke committed
180
181
182
183
184
185
186
187
188
189
190
191
192
193
  if (features_in_group.size() > group2.size()) {
    features_in_group = group2;
  }
  std::vector<std::vector<int>> ret;
  for (size_t i = 0; i < features_in_group.size(); ++i) {
    if (features_in_group[i].size() <= 1 || features_in_group[i].size() >= 5) {
      ret.push_back(features_in_group[i]);
    } else {
      int cnt_non_zero = 0;
      for (size_t j = 0; j < features_in_group[i].size(); ++j) {
        const int fidx = features_in_group[i][j];
        cnt_non_zero += static_cast<int>(num_data * (1.0f - bin_mappers[fidx]->sparse_rate()));
      }
      double sparse_rate = 1.0f - static_cast<double>(cnt_non_zero) / (num_data);
194
      // take apart small sparse group, due it will not gain on speed
Guolin Ke's avatar
Guolin Ke committed
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
      if (sparse_rate >= sparse_threshold && is_enable_sparse) {
        for (size_t j = 0; j < features_in_group[i].size(); ++j) {
          const int fidx = features_in_group[i][j];
          ret.emplace_back();
          ret.back().push_back(fidx);
        }
      } else {
        ret.push_back(features_in_group[i]);
      }
    }
  }
  // shuffle groups
  int num_group = static_cast<int>(ret.size());
  Random tmp_rand(12);
  for (int i = 0; i < num_group - 1; ++i) {
    int j = tmp_rand.NextShort(i + 1, num_group);
    std::swap(ret[i], ret[j]);
  }
  return ret;
}

Guolin Ke's avatar
Guolin Ke committed
216
void Dataset::Construct(
Guolin Ke's avatar
Guolin Ke committed
217
  std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
218
  const std::vector<std::vector<double>>& forced_bins,
Guolin Ke's avatar
Guolin Ke committed
219
220
221
  int** sample_non_zero_indices,
  const int* num_per_col,
  size_t total_sample_cnt,
Guolin Ke's avatar
Guolin Ke committed
222
  const Config& io_config) {
Guolin Ke's avatar
Guolin Ke committed
223
  num_total_features_ = static_cast<int>(bin_mappers->size());
224
  sparse_threshold_ = io_config.sparse_threshold;
Guolin Ke's avatar
Guolin Ke committed
225
226
  // get num_features
  std::vector<int> used_features;
Guolin Ke's avatar
Guolin Ke committed
227
228
  for (int i = 0; i < static_cast<int>(bin_mappers->size()); ++i) {
    if (bin_mappers->at(i) != nullptr && !bin_mappers->at(i)->is_trivial()) {
Guolin Ke's avatar
Guolin Ke committed
229
      used_features.emplace_back(i);
Guolin Ke's avatar
Guolin Ke committed
230
    }
Guolin Ke's avatar
Guolin Ke committed
231
  }
Guolin Ke's avatar
Guolin Ke committed
232
  if (used_features.empty()) {
233
    Log::Warning("There are no meaningful features, as all feature values are constant.");
Guolin Ke's avatar
Guolin Ke committed
234
  }
Guolin Ke's avatar
Guolin Ke committed
235
236
  auto features_in_group = NoGroup(used_features);

237
  if (io_config.enable_bundle && !used_features.empty()) {
Guolin Ke's avatar
Guolin Ke committed
238
    features_in_group = FastFeatureBundling(*bin_mappers,
Guolin Ke's avatar
Guolin Ke committed
239
240
241
                                            sample_non_zero_indices, num_per_col, total_sample_cnt,
                                            used_features, io_config.max_conflict_rate,
                                            num_data_, io_config.min_data_in_leaf,
Guolin Ke's avatar
Guolin Ke committed
242
                                            sparse_threshold_, io_config.is_enable_sparse, io_config.device_type == std::string("gpu"));
Guolin Ke's avatar
Guolin Ke committed
243
244
  }

Guolin Ke's avatar
Guolin Ke committed
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
  num_features_ = 0;
  for (const auto& fs : features_in_group) {
    num_features_ += static_cast<int>(fs.size());
  }
  int cur_fidx = 0;
  used_feature_map_ = std::vector<int>(num_total_features_, -1);
  num_groups_ = static_cast<int>(features_in_group.size());
  real_feature_idx_.resize(num_features_);
  feature2group_.resize(num_features_);
  feature2subfeature_.resize(num_features_);
  for (int i = 0; i < num_groups_; ++i) {
    auto cur_features = features_in_group[i];
    int cur_cnt_features = static_cast<int>(cur_features.size());
    // get bin_mappers
    std::vector<std::unique_ptr<BinMapper>> cur_bin_mappers;
    for (int j = 0; j < cur_cnt_features; ++j) {
      int real_fidx = cur_features[j];
      used_feature_map_[real_fidx] = cur_fidx;
      real_feature_idx_[cur_fidx] = real_fidx;
      feature2group_[cur_fidx] = i;
      feature2subfeature_[cur_fidx] = j;
Guolin Ke's avatar
Guolin Ke committed
266
      cur_bin_mappers.emplace_back(bin_mappers->at(real_fidx).release());
Guolin Ke's avatar
Guolin Ke committed
267
268
269
      ++cur_fidx;
    }
    feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>(
Guolin Ke's avatar
Guolin Ke committed
270
      new FeatureGroup(cur_cnt_features, &cur_bin_mappers, num_data_, sparse_threshold_,
Guolin Ke's avatar
Guolin Ke committed
271
                       io_config.is_enable_sparse)));
Guolin Ke's avatar
Guolin Ke committed
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
  }
  feature_groups_.shrink_to_fit();
  group_bin_boundaries_.clear();
  uint64_t num_total_bin = 0;
  group_bin_boundaries_.push_back(num_total_bin);
  for (int i = 0; i < num_groups_; ++i) {
    num_total_bin += feature_groups_[i]->num_total_bin_;
    group_bin_boundaries_.push_back(num_total_bin);
  }
  int last_group = 0;
  group_feature_start_.reserve(num_groups_);
  group_feature_cnt_.reserve(num_groups_);
  group_feature_start_.push_back(0);
  group_feature_cnt_.push_back(1);
  for (int i = 1; i < num_features_; ++i) {
    const int group = feature2group_[i];
    if (group == last_group) {
      group_feature_cnt_.back() = group_feature_cnt_.back() + 1;
    } else {
      group_feature_start_.push_back(i);
      group_feature_cnt_.push_back(1);
      last_group = group;
    }
  }
Guolin Ke's avatar
Guolin Ke committed
296
297
298
299
300
301
302
303
304
305
306
307
308
309

  if (!io_config.monotone_constraints.empty()) {
    CHECK(static_cast<size_t>(num_total_features_) == io_config.monotone_constraints.size());
    monotone_types_.resize(num_features_);
    for (int i = 0; i < num_total_features_; ++i) {
      int inner_fidx = InnerFeatureIndex(i);
      if (inner_fidx >= 0) {
        monotone_types_[inner_fidx] = io_config.monotone_constraints[i];
      }
    }
    if (ArrayArgs<int8_t>::CheckAllZero(monotone_types_)) {
      monotone_types_.clear();
    }
  }
Guolin Ke's avatar
Guolin Ke committed
310
311
312
313
314
315
316
317
318
319
320
321
322
  if (!io_config.feature_contri.empty()) {
    CHECK(static_cast<size_t>(num_total_features_) == io_config.feature_contri.size());
    feature_penalty_.resize(num_features_);
    for (int i = 0; i < num_total_features_; ++i) {
      int inner_fidx = InnerFeatureIndex(i);
      if (inner_fidx >= 0) {
        feature_penalty_[inner_fidx] = std::max(0.0, io_config.feature_contri[i]);
      }
    }
    if (ArrayArgs<double>::CheckAll(feature_penalty_, 1.0)) {
      feature_penalty_.clear();
    }
  }
Belinda Trotta's avatar
Belinda Trotta committed
323
324
325
326
327
328
  if (!io_config.max_bin_by_feature.empty()) {
    CHECK(static_cast<size_t>(num_total_features_) == io_config.max_bin_by_feature.size());
    CHECK(*(std::min_element(io_config.max_bin_by_feature.begin(), io_config.max_bin_by_feature.end())) > 1);
    max_bin_by_feature_.resize(num_total_features_);
    max_bin_by_feature_.assign(io_config.max_bin_by_feature.begin(), io_config.max_bin_by_feature.end());
  }
329
  forced_bin_bounds_ = forced_bins;
330
331
332
333
334
335
336
337
338
339
340
341
342
343
  max_bin_ = io_config.max_bin;
  min_data_in_bin_ = io_config.min_data_in_bin;
  bin_construct_sample_cnt_ = io_config.bin_construct_sample_cnt;
  use_missing_ = io_config.use_missing;
  zero_as_missing_ = io_config.zero_as_missing;
}

void Dataset::ResetConfig(const char* parameters) {
  auto param = Config::Str2Map(parameters);
  Config io_config;
  io_config.Set(param);
  if (param.count("max_bin") && io_config.max_bin != max_bin_) {
    Log::Warning("Cannot change max_bin after constructed Dataset handle.");
  }
Belinda Trotta's avatar
Belinda Trotta committed
344
345
346
  if (param.count("max_bin_by_feature") && io_config.max_bin_by_feature != max_bin_by_feature_) {
    Log::Warning("Cannot change max_bin_by_feature after constructed Dataset handle.");
  }
347
348
349
350
351
352
353
354
355
356
357
358
  if (param.count("bin_construct_sample_cnt") && io_config.bin_construct_sample_cnt != bin_construct_sample_cnt_) {
    Log::Warning("Cannot change bin_construct_sample_cnt after constructed Dataset handle.");
  }
  if (param.count("min_data_in_bin") && io_config.min_data_in_bin != min_data_in_bin_) {
    Log::Warning("Cannot change min_data_in_bin after constructed Dataset handle.");
  }
  if (param.count("use_missing") && io_config.use_missing != use_missing_) {
    Log::Warning("Cannot change use_missing after constructed Dataset handle.");
  }
  if (param.count("zero_as_missing") && io_config.zero_as_missing != zero_as_missing_) {
    Log::Warning("Cannot change zero_as_missing after constructed Dataset handle.");
  }
Guolin Ke's avatar
Guolin Ke committed
359
360
361
  if (param.count("sparse_threshold") && io_config.sparse_threshold != sparse_threshold_) {
    Log::Warning("Cannot change sparse_threshold after constructed Dataset handle.");
  }
362
363
364
  if (param.count("forcedbins_filename")) {
    Log::Warning("Cannot change forced bins after constructed Dataset handle.");
  }
Guolin Ke's avatar
Guolin Ke committed
365

366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
  if (!io_config.monotone_constraints.empty()) {
    CHECK(static_cast<size_t>(num_total_features_) == io_config.monotone_constraints.size());
    monotone_types_.resize(num_features_);
    for (int i = 0; i < num_total_features_; ++i) {
      int inner_fidx = InnerFeatureIndex(i);
      if (inner_fidx >= 0) {
        monotone_types_[inner_fidx] = io_config.monotone_constraints[i];
      }
    }
    if (ArrayArgs<int8_t>::CheckAllZero(monotone_types_)) {
      monotone_types_.clear();
    }
  }
  if (!io_config.feature_contri.empty()) {
    CHECK(static_cast<size_t>(num_total_features_) == io_config.feature_contri.size());
    feature_penalty_.resize(num_features_);
    for (int i = 0; i < num_total_features_; ++i) {
      int inner_fidx = InnerFeatureIndex(i);
      if (inner_fidx >= 0) {
        feature_penalty_[inner_fidx] = std::max(0.0, io_config.feature_contri[i]);
      }
    }
    if (ArrayArgs<double>::CheckAll(feature_penalty_, 1.0)) {
      feature_penalty_.clear();
    }
  }
Guolin Ke's avatar
Guolin Ke committed
392
393
}

Guolin Ke's avatar
Guolin Ke committed
394
void Dataset::FinishLoad() {
Guolin Ke's avatar
Guolin Ke committed
395
  if (is_finish_load_) { return; }
396
397
398
399
400
401
402
403
404
  if (num_groups_ > 0) {
    OMP_INIT_EX();
#pragma omp parallel for schedule(guided)
    for (int i = 0; i < num_groups_; ++i) {
      OMP_LOOP_EX_BEGIN();
      feature_groups_[i]->bin_data_->FinishLoad();
      OMP_LOOP_EX_END();
    }
    OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
405
  }
Guolin Ke's avatar
Guolin Ke committed
406
  is_finish_load_ = true;
Guolin Ke's avatar
Guolin Ke committed
407
}
Guolin Ke's avatar
Guolin Ke committed
408

409
void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) {
Guolin Ke's avatar
Guolin Ke committed
410
  feature_groups_.clear();
Guolin Ke's avatar
Guolin Ke committed
411
  num_features_ = dataset->num_features_;
Guolin Ke's avatar
Guolin Ke committed
412
  num_groups_ = dataset->num_groups_;
413
  sparse_threshold_ = dataset->sparse_threshold_;
Guolin Ke's avatar
Guolin Ke committed
414
  // copy feature bin mapper data
Guolin Ke's avatar
Guolin Ke committed
415
416
417
418
419
420
421
  for (int i = 0; i < num_groups_; ++i) {
    std::vector<std::unique_ptr<BinMapper>> bin_mappers;
    for (int j = 0; j < dataset->feature_groups_[i]->num_feature_; ++j) {
      bin_mappers.emplace_back(new BinMapper(*(dataset->feature_groups_[i]->bin_mappers_[j])));
    }
    feature_groups_.emplace_back(new FeatureGroup(
      dataset->feature_groups_[i]->num_feature_,
Guolin Ke's avatar
Guolin Ke committed
422
      &bin_mappers,
Guolin Ke's avatar
Guolin Ke committed
423
      num_data_,
Guolin Ke's avatar
Guolin Ke committed
424
      dataset->feature_groups_[i]->is_sparse_));
Guolin Ke's avatar
Guolin Ke committed
425
  }
Guolin Ke's avatar
Guolin Ke committed
426
  feature_groups_.shrink_to_fit();
Guolin Ke's avatar
Guolin Ke committed
427
428
429
  used_feature_map_ = dataset->used_feature_map_;
  num_total_features_ = dataset->num_total_features_;
  feature_names_ = dataset->feature_names_;
Guolin Ke's avatar
Guolin Ke committed
430
  label_idx_ = dataset->label_idx_;
Guolin Ke's avatar
Guolin Ke committed
431
432
433
434
435
436
  real_feature_idx_ = dataset->real_feature_idx_;
  feature2group_ = dataset->feature2group_;
  feature2subfeature_ = dataset->feature2subfeature_;
  group_bin_boundaries_ = dataset->group_bin_boundaries_;
  group_feature_start_ = dataset->group_feature_start_;
  group_feature_cnt_ = dataset->group_feature_cnt_;
Guolin Ke's avatar
Guolin Ke committed
437
  monotone_types_ = dataset->monotone_types_;
Guolin Ke's avatar
Guolin Ke committed
438
  feature_penalty_ = dataset->feature_penalty_;
439
  forced_bin_bounds_ = dataset->forced_bin_bounds_;
Guolin Ke's avatar
Guolin Ke committed
440
441
442
443
444
445
}

void Dataset::CreateValid(const Dataset* dataset) {
  feature_groups_.clear();
  num_features_ = dataset->num_features_;
  num_groups_ = num_features_;
446
  sparse_threshold_ = dataset->sparse_threshold_;
Guolin Ke's avatar
Guolin Ke committed
447
448
449
450
451
452
453
454
455
  bool is_enable_sparse = true;
  feature2group_.clear();
  feature2subfeature_.clear();
  // copy feature bin mapper data
  for (int i = 0; i < num_features_; ++i) {
    std::vector<std::unique_ptr<BinMapper>> bin_mappers;
    bin_mappers.emplace_back(new BinMapper(*(dataset->FeatureBinMapper(i))));
    feature_groups_.emplace_back(new FeatureGroup(
      1,
Guolin Ke's avatar
Guolin Ke committed
456
      &bin_mappers,
Guolin Ke's avatar
Guolin Ke committed
457
      num_data_,
Guolin Ke's avatar
Guolin Ke committed
458
      sparse_threshold_,
Guolin Ke's avatar
Guolin Ke committed
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
      is_enable_sparse));
    feature2group_.push_back(i);
    feature2subfeature_.push_back(0);
  }

  feature_groups_.shrink_to_fit();
  used_feature_map_ = dataset->used_feature_map_;
  num_total_features_ = dataset->num_total_features_;
  feature_names_ = dataset->feature_names_;
  label_idx_ = dataset->label_idx_;
  real_feature_idx_ = dataset->real_feature_idx_;
  group_bin_boundaries_.clear();
  uint64_t num_total_bin = 0;
  group_bin_boundaries_.push_back(num_total_bin);
  for (int i = 0; i < num_groups_; ++i) {
    num_total_bin += feature_groups_[i]->num_total_bin_;
    group_bin_boundaries_.push_back(num_total_bin);
  }
  int last_group = 0;
  group_feature_start_.reserve(num_groups_);
  group_feature_cnt_.reserve(num_groups_);
  group_feature_start_.push_back(0);
  group_feature_cnt_.push_back(1);
  for (int i = 1; i < num_features_; ++i) {
    const int group = feature2group_[i];
    if (group == last_group) {
      group_feature_cnt_.back() = group_feature_cnt_.back() + 1;
    } else {
      group_feature_start_.push_back(i);
      group_feature_cnt_.push_back(1);
      last_group = group;
    }
  }
Guolin Ke's avatar
Guolin Ke committed
492
  monotone_types_ = dataset->monotone_types_;
Guolin Ke's avatar
Guolin Ke committed
493
  feature_penalty_ = dataset->feature_penalty_;
494
  forced_bin_bounds_ = dataset->forced_bin_bounds_;
Guolin Ke's avatar
Guolin Ke committed
495
496
}

Guolin Ke's avatar
Guolin Ke committed
497
498
499
void Dataset::ReSize(data_size_t num_data) {
  if (num_data_ != num_data) {
    num_data_ = num_data;
500
    OMP_INIT_EX();
Guolin Ke's avatar
Guolin Ke committed
501
    #pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
502
    for (int group = 0; group < num_groups_; ++group) {
503
      OMP_LOOP_EX_BEGIN();
Guolin Ke's avatar
Guolin Ke committed
504
      feature_groups_[group]->bin_data_->ReSize(num_data_);
505
      OMP_LOOP_EX_END();
Guolin Ke's avatar
Guolin Ke committed
506
    }
507
    OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
508
509
510
511
512
  }
}

void Dataset::CopySubset(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data) {
  CHECK(num_used_indices == num_data_);
513
  OMP_INIT_EX();
Guolin Ke's avatar
Guolin Ke committed
514
  #pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
515
  for (int group = 0; group < num_groups_; ++group) {
516
    OMP_LOOP_EX_BEGIN();
Guolin Ke's avatar
Guolin Ke committed
517
    feature_groups_[group]->CopySubset(fullset->feature_groups_[group].get(), used_indices, num_used_indices);
518
    OMP_LOOP_EX_END();
Guolin Ke's avatar
Guolin Ke committed
519
  }
520
  OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
521
  if (need_meta_data) {
Guolin Ke's avatar
Guolin Ke committed
522
    metadata_.Init(fullset->metadata_, used_indices, num_used_indices);
Guolin Ke's avatar
Guolin Ke committed
523
  }
Guolin Ke's avatar
Guolin Ke committed
524
  is_finish_load_ = true;
Guolin Ke's avatar
Guolin Ke committed
525
526
}

527
bool Dataset::SetFloatField(const char* field_name, const float* field_data, data_size_t num_element) {
Guolin Ke's avatar
Guolin Ke committed
528
529
530
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("label") || name == std::string("target")) {
531
    #ifdef LABEL_T_USE_DOUBLE
532
    Log::Fatal("Don't support LABEL_T_USE_DOUBLE");
533
    #else
534
    metadata_.SetLabel(field_data, num_element);
535
    #endif
Guolin Ke's avatar
Guolin Ke committed
536
  } else if (name == std::string("weight") || name == std::string("weights")) {
537
    #ifdef LABEL_T_USE_DOUBLE
538
    Log::Fatal("Don't support LABEL_T_USE_DOUBLE");
539
    #else
540
    metadata_.SetWeights(field_data, num_element);
541
    #endif
Guolin Ke's avatar
Guolin Ke committed
542
543
544
545
546
547
548
549
550
551
  } else {
    return false;
  }
  return true;
}

bool Dataset::SetDoubleField(const char* field_name, const double* field_data, data_size_t num_element) {
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("init_score")) {
552
    metadata_.SetInitScore(field_data, num_element);
Guolin Ke's avatar
Guolin Ke committed
553
  } else {
554
    return false;
Guolin Ke's avatar
Guolin Ke committed
555
  }
556
  return true;
Guolin Ke's avatar
Guolin Ke committed
557
558
}

559
560
561
562
bool Dataset::SetIntField(const char* field_name, const int* field_data, data_size_t num_element) {
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("query") || name == std::string("group")) {
Guolin Ke's avatar
Guolin Ke committed
563
    metadata_.SetQuery(field_data, num_element);
564
565
566
567
568
569
  } else {
    return false;
  }
  return true;
}

Guolin Ke's avatar
Guolin Ke committed
570
bool Dataset::GetFloatField(const char* field_name, data_size_t* out_len, const float** out_ptr) {
571
572
573
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("label") || name == std::string("target")) {
574
    #ifdef LABEL_T_USE_DOUBLE
575
    Log::Fatal("Don't support LABEL_T_USE_DOUBLE");
576
    #else
577
578
    *out_ptr = metadata_.label();
    *out_len = num_data_;
579
    #endif
580
  } else if (name == std::string("weight") || name == std::string("weights")) {
581
    #ifdef LABEL_T_USE_DOUBLE
582
    Log::Fatal("Don't support LABEL_T_USE_DOUBLE");
583
    #else
584
585
    *out_ptr = metadata_.weights();
    *out_len = num_data_;
586
    #endif
Guolin Ke's avatar
Guolin Ke committed
587
588
589
590
591
592
593
594
595
596
  } else {
    return false;
  }
  return true;
}

bool Dataset::GetDoubleField(const char* field_name, data_size_t* out_len, const double** out_ptr) {
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("init_score")) {
597
    *out_ptr = metadata_.init_score();
Guolin Ke's avatar
Guolin Ke committed
598
    *out_len = static_cast<data_size_t>(metadata_.num_init_score());
599
  } else if (name == std::string("feature_penalty")) {
600
    *out_ptr = feature_penalty_.data();
Guolin Ke's avatar
Guolin Ke committed
601
    *out_len = static_cast<data_size_t>(feature_penalty_.size());
602
  } else {
603
604
    return false;
  }
605
  return true;
606
607
}

Guolin Ke's avatar
Guolin Ke committed
608
bool Dataset::GetIntField(const char* field_name, data_size_t* out_len, const int** out_ptr) {
609
610
611
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("query") || name == std::string("group")) {
612
    *out_ptr = metadata_.query_boundaries();
Guolin Ke's avatar
Guolin Ke committed
613
    *out_len = metadata_.num_queries() + 1;
Guolin Ke's avatar
Guolin Ke committed
614
615
616
  } else {
    return false;
  }
617
  return true;
618
619
}

620
621
622
623
624
bool Dataset::GetInt8Field(const char* field_name, data_size_t* out_len, const int8_t** out_ptr) {
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("monotone_constraints")) {
    *out_ptr = monotone_types_.data();
Guolin Ke's avatar
Guolin Ke committed
625
    *out_len = static_cast<data_size_t>(monotone_types_.size());
626
627
628
629
630
631
  } else {
    return false;
  }
  return true;
}

Guolin Ke's avatar
Guolin Ke committed
632
void Dataset::SaveBinaryFile(const char* bin_filename) {
Guolin Ke's avatar
Guolin Ke committed
633
  if (bin_filename != nullptr
Guolin Ke's avatar
Guolin Ke committed
634
      && std::string(bin_filename) == data_filename_) {
635
    Log::Warning("Bianry file %s already exists", bin_filename);
Guolin Ke's avatar
Guolin Ke committed
636
637
    return;
  }
Guolin Ke's avatar
Guolin Ke committed
638
  // if not pass a filename, just append ".bin" of original file
Guolin Ke's avatar
Guolin Ke committed
639
  std::string bin_filename_str(data_filename_);
Guolin Ke's avatar
Guolin Ke committed
640
641
642
643
  if (bin_filename == nullptr || bin_filename[0] == '\0') {
    bin_filename_str.append(".bin");
    bin_filename = bin_filename_str.c_str();
  }
Guolin Ke's avatar
Guolin Ke committed
644
  bool is_file_existed = false;
645
646

  if (VirtualFileWriter::Exists(bin_filename)) {
Guolin Ke's avatar
Guolin Ke committed
647
    is_file_existed = true;
648
    Log::Warning("File %s exists, cannot save binary to it", bin_filename);
Guolin Ke's avatar
Guolin Ke committed
649
  }
Guolin Ke's avatar
Guolin Ke committed
650

Guolin Ke's avatar
Guolin Ke committed
651
  if (!is_file_existed) {
652
653
    auto writer = VirtualFileWriter::Make(bin_filename);
    if (!writer->Init()) {
Guolin Ke's avatar
Guolin Ke committed
654
      Log::Fatal("Cannot write binary data to %s ", bin_filename);
Guolin Ke's avatar
Guolin Ke committed
655
    }
656
    Log::Info("Saving data to binary file %s", bin_filename);
657
    size_t size_of_token = std::strlen(binary_file_token);
658
    writer->Write(binary_file_token, size_of_token);
Guolin Ke's avatar
Guolin Ke committed
659
    // get size of header
Guolin Ke's avatar
Guolin Ke committed
660
    size_t size_of_header = sizeof(num_data_) + sizeof(num_features_) + sizeof(num_total_features_)
Guolin Ke's avatar
Guolin Ke committed
661
      + sizeof(int) * num_total_features_ + sizeof(label_idx_) + sizeof(num_groups_) + sizeof(sparse_threshold_)
Guolin Ke's avatar
Guolin Ke committed
662
      + 3 * sizeof(int) * num_features_ + sizeof(uint64_t) * (num_groups_ + 1) + 2 * sizeof(int) * num_groups_ + sizeof(int8_t) * num_features_
Belinda Trotta's avatar
Belinda Trotta committed
663
      + sizeof(double) * num_features_ + sizeof(int32_t) * num_total_features_ + sizeof(int) * 3 + sizeof(bool) * 2;
664
665
666
667
    // size of feature names
    for (int i = 0; i < num_total_features_; ++i) {
      size_of_header += feature_names_[i].size() + sizeof(int);
    }
668
669
670
671
    // size of forced bins
    for (int i = 0; i < num_total_features_; ++i) {
      size_of_header += forced_bin_bounds_[i].size() * sizeof(double) + sizeof(int);
    }
672
    writer->Write(&size_of_header, sizeof(size_of_header));
Guolin Ke's avatar
Guolin Ke committed
673
    // write header
674
675
676
677
    writer->Write(&num_data_, sizeof(num_data_));
    writer->Write(&num_features_, sizeof(num_features_));
    writer->Write(&num_total_features_, sizeof(num_total_features_));
    writer->Write(&label_idx_, sizeof(label_idx_));
678
679
680
681
682
    writer->Write(&max_bin_, sizeof(max_bin_));
    writer->Write(&bin_construct_sample_cnt_, sizeof(bin_construct_sample_cnt_));
    writer->Write(&min_data_in_bin_, sizeof(min_data_in_bin_));
    writer->Write(&use_missing_, sizeof(use_missing_));
    writer->Write(&zero_as_missing_, sizeof(zero_as_missing_));
Guolin Ke's avatar
Guolin Ke committed
683
    writer->Write(&sparse_threshold_, sizeof(sparse_threshold_));
684
685
686
687
688
689
690
691
    writer->Write(used_feature_map_.data(), sizeof(int) * num_total_features_);
    writer->Write(&num_groups_, sizeof(num_groups_));
    writer->Write(real_feature_idx_.data(), sizeof(int) * num_features_);
    writer->Write(feature2group_.data(), sizeof(int) * num_features_);
    writer->Write(feature2subfeature_.data(), sizeof(int) * num_features_);
    writer->Write(group_bin_boundaries_.data(), sizeof(uint64_t) * (num_groups_ + 1));
    writer->Write(group_feature_start_.data(), sizeof(int) * num_groups_);
    writer->Write(group_feature_cnt_.data(), sizeof(int) * num_groups_);
Guolin Ke's avatar
Guolin Ke committed
692
693
694
695
696
697
698
    if (monotone_types_.empty()) {
      ArrayArgs<int8_t>::Assign(&monotone_types_, 0, num_features_);
    }
    writer->Write(monotone_types_.data(), sizeof(int8_t) * num_features_);
    if (ArrayArgs<int8_t>::CheckAllZero(monotone_types_)) {
      monotone_types_.clear();
    }
Guolin Ke's avatar
Guolin Ke committed
699
700
701
702
703
704
705
    if (feature_penalty_.empty()) {
      ArrayArgs<double>::Assign(&feature_penalty_, 1.0, num_features_);
    }
    writer->Write(feature_penalty_.data(), sizeof(double) * num_features_);
    if (ArrayArgs<double>::CheckAll(feature_penalty_, 1.0)) {
      feature_penalty_.clear();
    }
Belinda Trotta's avatar
Belinda Trotta committed
706
707
708
709
710
711
712
    if (max_bin_by_feature_.empty()) {
      ArrayArgs<int32_t>::Assign(&max_bin_by_feature_, -1, num_total_features_);
    }
    writer->Write(max_bin_by_feature_.data(), sizeof(int32_t) * num_total_features_);
    if (ArrayArgs<int32_t>::CheckAll(max_bin_by_feature_, -1)) {
      max_bin_by_feature_.clear();
    }
713
714
715
    // write feature names
    for (int i = 0; i < num_total_features_; ++i) {
      int str_len = static_cast<int>(feature_names_[i].size());
716
      writer->Write(&str_len, sizeof(int));
717
      const char* c_str = feature_names_[i].c_str();
718
      writer->Write(c_str, sizeof(char) * str_len);
719
    }
720
721
722
723
    // write forced bins
    for (int i = 0; i < num_total_features_; ++i) {
      int num_bounds = static_cast<int>(forced_bin_bounds_[i].size());
      writer->Write(&num_bounds, sizeof(int));
724

725
726
727
728
      for (size_t j = 0; j < forced_bin_bounds_[i].size(); ++j) {
        writer->Write(&forced_bin_bounds_[i][j], sizeof(double));
      }
    }
729

Guolin Ke's avatar
Guolin Ke committed
730
731
    // get size of meta data
    size_t size_of_metadata = metadata_.SizesInByte();
732
    writer->Write(&size_of_metadata, sizeof(size_of_metadata));
Guolin Ke's avatar
Guolin Ke committed
733
    // write meta data
734
    metadata_.SaveBinaryToFile(writer.get());
Guolin Ke's avatar
Guolin Ke committed
735
736

    // write feature data
Guolin Ke's avatar
Guolin Ke committed
737
    for (int i = 0; i < num_groups_; ++i) {
Guolin Ke's avatar
Guolin Ke committed
738
      // get size of feature
Guolin Ke's avatar
Guolin Ke committed
739
      size_t size_of_feature = feature_groups_[i]->SizesInByte();
740
      writer->Write(&size_of_feature, sizeof(size_of_feature));
Guolin Ke's avatar
Guolin Ke committed
741
      // write feature
742
      feature_groups_[i]->SaveBinaryToFile(writer.get());
Guolin Ke's avatar
Guolin Ke committed
743
744
745
746
    }
  }
}

747
void Dataset::DumpTextFile(const char* text_filename) {
Guolin Ke's avatar
Guolin Ke committed
748
749
750
751
752
753
  FILE* file = NULL;
#if _MSC_VER
  fopen_s(&file, text_filename, "wt");
#else
  file = fopen(text_filename, "wt");
#endif
754
755
756
757
758
  fprintf(file, "num_features: %d\n", num_features_);
  fprintf(file, "num_total_features: %d\n", num_total_features_);
  fprintf(file, "num_groups: %d\n", num_groups_);
  fprintf(file, "num_data: %d\n", num_data_);
  fprintf(file, "feature_names: ");
759
  for (auto n : feature_names_) {
760
761
762
    fprintf(file, "%s, ", n.c_str());
  }
  fprintf(file, "\nmonotone_constraints: ");
763
  for (auto i : monotone_types_) {
764
765
766
    fprintf(file, "%d, ", i);
  }
  fprintf(file, "\nfeature_penalty: ");
767
  for (auto i : feature_penalty_) {
768
769
    fprintf(file, "%lf, ", i);
  }
Belinda Trotta's avatar
Belinda Trotta committed
770
771
772
773
  fprintf(file, "\nmax_bin_by_feature: ");
  for (auto i : max_bin_by_feature_) {
    fprintf(file, "%d, ", i);
  }
774
  fprintf(file, "\n");
775
  for (auto n : feature_names_) {
776
777
    fprintf(file, "%s, ", n.c_str());
  }
778
779
780
781
782
783
784
  fprintf(file, "\nforced_bins: ");
  for (int i = 0; i < num_total_features_; ++i) {
    fprintf(file, "\nfeature %d: ", i);
    for (size_t j = 0; j < forced_bin_bounds_[i].size(); ++j) {
      fprintf(file, "%lf, ", forced_bin_bounds_[i][j]);
    }
  }
785
786
  std::vector<std::unique_ptr<BinIterator>> iterators;
  iterators.reserve(num_features_);
787
  for (int j = 0; j < num_features_; ++j) {
788
789
790
791
    auto group_idx = feature2group_[j];
    auto sub_idx = feature2subfeature_[j];
    iterators.emplace_back(feature_groups_[group_idx]->SubFeatureIterator(sub_idx));
  }
792
  for (data_size_t i = 0; i < num_data_; ++i) {
793
    fprintf(file, "\n");
794
    for (int j = 0; j < num_total_features_; ++j) {
795
      auto inner_feature_idx = used_feature_map_[j];
796
797
      if (inner_feature_idx < 0) {
        fprintf(file, "NA, ");
798
      } else {
799
        fprintf(file, "%d, ", iterators[inner_feature_idx]->RawGet(i));
800
801
802
803
804
805
      }
    }
  }
  fclose(file);
}

806
807
808
void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
                                  const data_size_t* data_indices, data_size_t num_data,
                                  int leaf_idx,
Guolin Ke's avatar
Guolin Ke committed
809
                                  std::vector<std::unique_ptr<OrderedBin>>* ordered_bins,
810
811
                                  const score_t* gradients, const score_t* hessians,
                                  score_t* ordered_gradients, score_t* ordered_hessians,
812
813
                                  bool is_constant_hessian,
                                  HistogramBinEntry* hist_data) const {
zhangjin's avatar
zhangjin committed
814
  if (leaf_idx < 0 || num_data < 0 || hist_data == nullptr) {
Guolin Ke's avatar
Guolin Ke committed
815
816
    return;
  }
Guolin Ke's avatar
Guolin Ke committed
817
818
819
820
821

  std::vector<int> used_group;
  used_group.reserve(num_groups_);
  for (int group = 0; group < num_groups_; ++group) {
    const int f_cnt = group_feature_cnt_[group];
822
    bool is_group_used = false;
Guolin Ke's avatar
Guolin Ke committed
823
824
825
    for (int j = 0; j < f_cnt; ++j) {
      const int fidx = group_feature_start_[group] + j;
      if (is_feature_used[fidx]) {
826
        is_group_used = true;
Guolin Ke's avatar
Guolin Ke committed
827
828
829
        break;
      }
    }
830
831
832
    if (is_group_used) {
      used_group.push_back(group);
    }
Guolin Ke's avatar
Guolin Ke committed
833
834
  }
  int num_used_group = static_cast<int>(used_group.size());
Guolin Ke's avatar
Guolin Ke committed
835
836
837
  auto ptr_ordered_grad = gradients;
  auto ptr_ordered_hess = hessians;
  if (data_indices != nullptr && num_data < num_data_) {
838
839
840
841
842
843
844
845
846
847
848
    if (!is_constant_hessian) {
      #pragma omp parallel for schedule(static)
      for (data_size_t i = 0; i < num_data; ++i) {
        ordered_gradients[i] = gradients[data_indices[i]];
        ordered_hessians[i] = hessians[data_indices[i]];
      }
    } else {
      #pragma omp parallel for schedule(static)
      for (data_size_t i = 0; i < num_data; ++i) {
        ordered_gradients[i] = gradients[data_indices[i]];
      }
Guolin Ke's avatar
Guolin Ke committed
849
850
851
    }
    ptr_ordered_grad = ordered_gradients;
    ptr_ordered_hess = ordered_hessians;
852
853
854
    if (!is_constant_hessian) {
      OMP_INIT_EX();
      #pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
855
      for (int gi = 0; gi < num_used_group; ++gi) {
856
        OMP_LOOP_EX_BEGIN();
Guolin Ke's avatar
Guolin Ke committed
857
        int group = used_group[gi];
858
859
860
        // feature is not used
        auto data_ptr = hist_data + group_bin_boundaries_[group];
        const int num_bin = feature_groups_[group]->num_total_bin_;
Guolin Ke's avatar
Guolin Ke committed
861
        std::memset(reinterpret_cast<void*>(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry));
862
        // construct histograms for smaller leaf
Guolin Ke's avatar
Guolin Ke committed
863
        if (ordered_bins->at(group) == nullptr) {
864
865
866
867
868
869
870
871
872
          // if not use ordered bin
          feature_groups_[group]->bin_data_->ConstructHistogram(
            data_indices,
            num_data,
            ptr_ordered_grad,
            ptr_ordered_hess,
            data_ptr);
        } else {
          // used ordered bin
Guolin Ke's avatar
Guolin Ke committed
873
          ordered_bins->at(group)->ConstructHistogram(leaf_idx,
874
875
876
                                                  gradients,
                                                  hessians,
                                                  data_ptr);
877
        }
878
        OMP_LOOP_EX_END();
879
      }
880
881
882
883
      OMP_THROW_EX();
    } else {
      OMP_INIT_EX();
      #pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
884
      for (int gi = 0; gi < num_used_group; ++gi) {
885
        OMP_LOOP_EX_BEGIN();
Guolin Ke's avatar
Guolin Ke committed
886
        int group = used_group[gi];
887
888
889
        // feature is not used
        auto data_ptr = hist_data + group_bin_boundaries_[group];
        const int num_bin = feature_groups_[group]->num_total_bin_;
Guolin Ke's avatar
Guolin Ke committed
890
        std::memset(reinterpret_cast<void*>(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry));
891
        // construct histograms for smaller leaf
Guolin Ke's avatar
Guolin Ke committed
892
        if (ordered_bins->at(group) == nullptr) {
893
894
895
896
897
898
899
900
          // if not use ordered bin
          feature_groups_[group]->bin_data_->ConstructHistogram(
            data_indices,
            num_data,
            ptr_ordered_grad,
            data_ptr);
        } else {
          // used ordered bin
Guolin Ke's avatar
Guolin Ke committed
901
          ordered_bins->at(group)->ConstructHistogram(leaf_idx,
902
903
904
905
906
907
908
909
                                                  gradients,
                                                  data_ptr);
        }
        // fixed hessian.
        for (int i = 0; i < num_bin; ++i) {
          data_ptr[i].sum_hessians = data_ptr[i].cnt * hessians[0];
        }
        OMP_LOOP_EX_END();
Guolin Ke's avatar
Guolin Ke committed
910
      }
911
      OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
912
    }
913
  } else {
914
915
916
    if (!is_constant_hessian) {
      OMP_INIT_EX();
      #pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
917
      for (int gi = 0; gi < num_used_group; ++gi) {
918
        OMP_LOOP_EX_BEGIN();
Guolin Ke's avatar
Guolin Ke committed
919
        int group = used_group[gi];
920
921
922
        // feature is not used
        auto data_ptr = hist_data + group_bin_boundaries_[group];
        const int num_bin = feature_groups_[group]->num_total_bin_;
Guolin Ke's avatar
Guolin Ke committed
923
        std::memset(reinterpret_cast<void*>(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry));
924
        // construct histograms for smaller leaf
Guolin Ke's avatar
Guolin Ke committed
925
        if (ordered_bins->at(group) == nullptr) {
926
927
928
929
930
931
932
933
          // if not use ordered bin
          feature_groups_[group]->bin_data_->ConstructHistogram(
            num_data,
            ptr_ordered_grad,
            ptr_ordered_hess,
            data_ptr);
        } else {
          // used ordered bin
Guolin Ke's avatar
Guolin Ke committed
934
          ordered_bins->at(group)->ConstructHistogram(leaf_idx,
935
936
937
938
939
                                                  gradients,
                                                  hessians,
                                                  data_ptr);
        }
        OMP_LOOP_EX_END();
940
      }
941
942
943
944
      OMP_THROW_EX();
    } else {
      OMP_INIT_EX();
      #pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
945
      for (int gi = 0; gi < num_used_group; ++gi) {
946
        OMP_LOOP_EX_BEGIN();
Guolin Ke's avatar
Guolin Ke committed
947
        int group = used_group[gi];
948
949
950
        // feature is not used
        auto data_ptr = hist_data + group_bin_boundaries_[group];
        const int num_bin = feature_groups_[group]->num_total_bin_;
Guolin Ke's avatar
Guolin Ke committed
951
        std::memset(reinterpret_cast<void*>(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry));
952
        // construct histograms for smaller leaf
Guolin Ke's avatar
Guolin Ke committed
953
        if (ordered_bins->at(group) == nullptr) {
954
955
956
957
958
959
960
          // if not use ordered bin
          feature_groups_[group]->bin_data_->ConstructHistogram(
            num_data,
            ptr_ordered_grad,
            data_ptr);
        } else {
          // used ordered bin
Guolin Ke's avatar
Guolin Ke committed
961
          ordered_bins->at(group)->ConstructHistogram(leaf_idx,
962
963
964
965
966
967
968
969
                                                  gradients,
                                                  data_ptr);
        }
        // fixed hessian.
        for (int i = 0; i < num_bin; ++i) {
          data_ptr[i].sum_hessians = data_ptr[i].cnt * hessians[0];
        }
        OMP_LOOP_EX_END();
970
      }
971
      OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
972
973
974
975
976
    }
  }
}

void Dataset::FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, data_size_t num_data,
977
                           HistogramBinEntry* data) const {
Guolin Ke's avatar
Guolin Ke committed
978
979
980
981
982
983
  const int group = feature2group_[feature_idx];
  const int sub_feature = feature2subfeature_[feature_idx];
  const BinMapper* bin_mapper = feature_groups_[group]->bin_mappers_[sub_feature].get();
  const int default_bin = bin_mapper->GetDefaultBin();
  if (default_bin > 0) {
    const int num_bin = bin_mapper->num_bin();
984
985
986
    data[default_bin].sum_gradients = sum_gradient;
    data[default_bin].sum_hessians = sum_hessian;
    data[default_bin].cnt = num_data;
Guolin Ke's avatar
Guolin Ke committed
987
988
    for (int i = 0; i < num_bin; ++i) {
      if (i != default_bin) {
989
990
991
        data[default_bin].sum_gradients -= data[i].sum_gradients;
        data[default_bin].sum_hessians -= data[i].sum_hessians;
        data[default_bin].cnt -= data[i].cnt;
Guolin Ke's avatar
Guolin Ke committed
992
993
994
995
996
      }
    }
  }
}

997
template<typename T>
Guolin Ke's avatar
Guolin Ke committed
998
999
void PushVector(std::vector<T>* dest, const std::vector<T>& src) {
  dest->reserve(dest->size() + src.size());
1000
  for (auto i : src) {
Guolin Ke's avatar
Guolin Ke committed
1001
    dest->push_back(i);
1002
1003
1004
1005
  }
}

template<typename T>
Guolin Ke's avatar
Guolin Ke committed
1006
1007
void PushOffset(std::vector<T>* dest, const std::vector<T>& src, const T& offset) {
  dest->reserve(dest->size() + src.size());
1008
  for (auto i : src) {
Guolin Ke's avatar
Guolin Ke committed
1009
    dest->push_back(i + offset);
1010
1011
1012
1013
  }
}

template<typename T>
Guolin Ke's avatar
Guolin Ke committed
1014
1015
void PushClearIfEmpty(std::vector<T>* dest, const size_t dest_len, const std::vector<T>& src, const size_t src_len, const T& deflt) {
  if (!dest->empty() && !src.empty()) {
1016
    PushVector(dest, src);
Guolin Ke's avatar
Guolin Ke committed
1017
  } else if (!dest->empty() && src.empty()) {
1018
    for (size_t i = 0; i < src_len; ++i) {
Guolin Ke's avatar
Guolin Ke committed
1019
      dest->push_back(deflt);
1020
    }
Guolin Ke's avatar
Guolin Ke committed
1021
  } else if (dest->empty() && !src.empty()) {
1022
    for (size_t i = 0; i < dest_len; ++i) {
Guolin Ke's avatar
Guolin Ke committed
1023
      dest->push_back(deflt);
1024
1025
1026
1027
1028
    }
    PushVector(dest, src);
  }
}

1029
1030
void Dataset::addFeaturesFrom(Dataset* other) {
  if (other->num_data_ != num_data_) {
1031
1032
    throw std::runtime_error("Cannot add features from other Dataset with a different number of rows");
  }
Guolin Ke's avatar
Guolin Ke committed
1033
1034
1035
  PushVector(&feature_names_, other->feature_names_);
  PushVector(&feature2subfeature_, other->feature2subfeature_);
  PushVector(&group_feature_cnt_, other->group_feature_cnt_);
1036
  PushVector(&forced_bin_bounds_, other->forced_bin_bounds_);
1037
  feature_groups_.reserve(other->feature_groups_.size());
1038
  for (auto& fg : other->feature_groups_) {
1039
1040
    feature_groups_.emplace_back(new FeatureGroup(*fg));
  }
1041
1042
  for (auto feature_idx : other->used_feature_map_) {
    if (feature_idx >= 0) {
1043
1044
1045
1046
1047
      used_feature_map_.push_back(feature_idx + num_features_);
    } else {
      used_feature_map_.push_back(-1);  // Unused feature.
    }
  }
Guolin Ke's avatar
Guolin Ke committed
1048
1049
  PushOffset(&real_feature_idx_, other->real_feature_idx_, num_total_features_);
  PushOffset(&feature2group_, other->feature2group_, num_groups_);
1050
1051
  auto bin_offset = group_bin_boundaries_.back();
  // Skip the leading 0 when copying group_bin_boundaries.
1052
  for (auto i = other->group_bin_boundaries_.begin()+1; i < other->group_bin_boundaries_.end(); ++i) {
1053
1054
    group_bin_boundaries_.push_back(*i + bin_offset);
  }
Guolin Ke's avatar
Guolin Ke committed
1055
  PushOffset(&group_feature_start_, other->group_feature_start_, num_features_);
1056

Guolin Ke's avatar
Guolin Ke committed
1057
1058
  PushClearIfEmpty(&monotone_types_, num_total_features_, other->monotone_types_, other->num_total_features_, (int8_t)0);
  PushClearIfEmpty(&feature_penalty_, num_total_features_, other->feature_penalty_, other->num_total_features_, 1.0);
1059
  PushClearIfEmpty(&max_bin_by_feature_, num_total_features_, other->max_bin_by_feature_, other->num_total_features_, -1);
1060

1061
1062
1063
1064
1065
  num_features_ += other->num_features_;
  num_total_features_ += other->num_total_features_;
  num_groups_ += other->num_groups_;
}

Guolin Ke's avatar
Guolin Ke committed
1066
}  // namespace LightGBM