"vscode:/vscode.git/clone" did not exist on "a2e5f496eddd3efa0717fa39eb4af2d5152e81f0"
dataset.cpp 43 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
Guolin Ke's avatar
Guolin Ke committed
5
#include <LightGBM/dataset.h>
6

Guolin Ke's avatar
Guolin Ke committed
7
#include <LightGBM/feature_group.h>
8
#include <LightGBM/utils/array_args.h>
9
#include <LightGBM/utils/openmp_wrapper.h>
Guolin Ke's avatar
Guolin Ke committed
10
#include <LightGBM/utils/threading.h>
Guolin Ke's avatar
Guolin Ke committed
11

12
#include <limits>
zhangyafeikimi's avatar
zhangyafeikimi committed
13
#include <chrono>
Guolin Ke's avatar
Guolin Ke committed
14
#include <cstdio>
Guolin Ke's avatar
Guolin Ke committed
15
#include <sstream>
16
#include <unordered_map>
Guolin Ke's avatar
Guolin Ke committed
17

18

Guolin Ke's avatar
Guolin Ke committed
19
20
namespace LightGBM {

21
// Definition of the static token string declared on Dataset.
// NOTE(review): presumably used as a marker for binary dataset files - confirm against the reader/writer code.
const char* Dataset::binary_file_token = "______LightGBM_Binary_File_Token______\n";
Guolin Ke's avatar
Guolin Ke committed
22

Guolin Ke's avatar
Guolin Ke committed
23
// Creates an empty dataset: placeholder file name, zero rows, not yet marked as loaded.
Dataset::Dataset() {
  data_filename_ = "noname";
  num_data_ = 0;
  is_finish_load_ = false;
}

29
// Creates a dataset that will hold `num_data` rows.
// num_data must be positive (enforced with CHECK). The metadata is initialized for that
// many rows with NO_SPECIFIC passed for both remaining Init arguments, and the group bin
// boundaries start with a single 0 entry.
Dataset::Dataset(data_size_t num_data) {
  CHECK(num_data > 0);
  data_filename_ = "noname";
  num_data_ = num_data;
  metadata_.Init(num_data_, NO_SPECIFIC, NO_SPECIFIC);
  is_finish_load_ = false;
  group_bin_boundaries_.push_back(0);
}

Guolin Ke's avatar
Guolin Ke committed
38
// Nothing to release by hand: the destructor body is intentionally empty.
Dataset::~Dataset() {
}
Guolin Ke's avatar
Guolin Ke committed
40

Guolin Ke's avatar
Guolin Ke committed
41
42
43
44
45
46
47
48
49
50
// Builds the trivial grouping: every used feature becomes its own
// single-element group, in the same order as `used_features`.
std::vector<std::vector<int>> NoGroup(
  const std::vector<int>& used_features) {
  std::vector<std::vector<int>> singleton_groups;
  singleton_groups.reserve(used_features.size());
  for (const int feature : used_features) {
    singleton_groups.push_back(std::vector<int>(1, feature));
  }
  return singleton_groups;
}

Guolin Ke's avatar
Guolin Ke committed
51
52
53
54
55
56
57
58
59
60
61
62
// Counts how many of the first `num_indices` entries of `indices` refer to
// positions that are already set in `mark`.
// Returns -1 as soon as the running count exceeds `max_cnt`; otherwise
// returns the total number of conflicts found.
int GetConfilctCount(const std::vector<bool>& mark, const int* indices, int num_indices, int max_cnt) {
  int conflicts = 0;
  for (int pos = 0; pos < num_indices; ++pos) {
    if (!mark[indices[pos]]) {
      continue;
    }
    ++conflicts;
    if (conflicts > max_cnt) {
      return -1;
    }
  }
  return conflicts;
}
Guolin Ke's avatar
Guolin Ke committed
63
// Sets mark[indices[i]] to true for each of the first `num_indices` entries of `indices`.
// `mark` must be non-null and large enough to be indexed by every value in `indices`.
void MarkUsed(std::vector<bool>* mark, const int* indices, int num_indices) {
  auto& ref_mark = *mark;
  for (int i = 0; i < num_indices; ++i) {
    ref_mark[indices[i]] = true;
  }
}

Guolin Ke's avatar
Guolin Ke committed
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
// Recomputes the sampled row positions of one feature relative to its most frequent bin.
// If the mapper's default bin equals its most frequent bin, an empty vector is returned.
// Otherwise every row position in [0, num_total_samples) is returned except those that
// are listed in `sample_indices` and whose value maps to the most frequent bin.
// `sample_indices` is walked with a single forward cursor, so it is expected to be
// sorted in ascending order.
std::vector<int> FixSampleIndices(const BinMapper* bin_mapper, int num_total_samples, int num_indices, const int* sample_indices, const double* sample_values) {
  std::vector<int> fixed_indices;
  if (bin_mapper->GetDefaultBin() == bin_mapper->GetMostFreqBin()) {
    return fixed_indices;
  }
  const auto most_freq_bin = bin_mapper->GetMostFreqBin();
  int cursor = 0;
  for (int row = 0; row < num_total_samples; ++row) {
    // skip sampled entries that lie before the current row
    while (cursor < num_indices && sample_indices[cursor] < row) {
      ++cursor;
    }
    const bool row_is_sampled = cursor < num_indices && sample_indices[cursor] == row;
    if (!row_is_sampled) {
      // rows without a sampled value are always kept
      fixed_indices.push_back(row);
    } else if (bin_mapper->ValueToBin(sample_values[cursor]) != most_freq_bin) {
      fixed_indices.push_back(row);
    }
  }
  return fixed_indices;
}

Guolin Ke's avatar
Guolin Ke committed
91
92
93
94
std::vector<std::vector<int>> FindGroups(const std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
                                         const std::vector<int>& find_order,
                                         int** sample_indices,
                                         const int* num_per_col,
95
                                         int num_sample_col,
Guolin Ke's avatar
Guolin Ke committed
96
97
98
                                         size_t total_sample_cnt,
                                         data_size_t max_error_cnt,
                                         data_size_t filter_cnt,
Guolin Ke's avatar
Guolin Ke committed
99
100
                                         data_size_t num_data,
                                         bool is_use_gpu) {
Guolin Ke's avatar
Guolin Ke committed
101
  const int max_search_group = 100;
Guolin Ke's avatar
Guolin Ke committed
102
  const int gpu_max_bin_per_group = 256;
Guolin Ke's avatar
Guolin Ke committed
103
104
105
106
107
108
109
110
  Random rand(num_data);
  std::vector<std::vector<int>> features_in_group;
  std::vector<std::vector<bool>> conflict_marks;
  std::vector<int> group_conflict_cnt;
  std::vector<size_t> group_non_zero_cnt;
  std::vector<int> group_num_bin;

  for (auto fidx : find_order) {
111
112
    bool is_filtered_feature = fidx >= num_sample_col;
    const size_t cur_non_zero_cnt = is_filtered_feature ? 0: num_per_col[fidx];
Guolin Ke's avatar
Guolin Ke committed
113
114
115
    bool need_new_group = true;
    std::vector<int> available_groups;
    for (int gid = 0; gid < static_cast<int>(features_in_group.size()); ++gid) {
116
      if (group_non_zero_cnt[gid] + cur_non_zero_cnt <= total_sample_cnt + max_error_cnt) {
Guolin Ke's avatar
Guolin Ke committed
117
118
119
120
        if (!is_use_gpu || group_num_bin[gid] + bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0)
            <= gpu_max_bin_per_group) {
          available_groups.push_back(gid);
        }
Guolin Ke's avatar
Guolin Ke committed
121
122
123
124
125
126
127
128
129
130
131
132
133
      }
    }
    std::vector<int> search_groups;
    if (!available_groups.empty()) {
      int last = static_cast<int>(available_groups.size()) - 1;
      auto indices = rand.Sample(last, std::min(last, max_search_group - 1));
      search_groups.push_back(available_groups.back());
      for (auto idx : indices) {
        search_groups.push_back(available_groups[idx]);
      }
    }
    for (auto gid : search_groups) {
      const int rest_max_cnt = max_error_cnt - group_conflict_cnt[gid];
134
      const int cnt = is_filtered_feature ? 0 : GetConfilctCount(conflict_marks[gid], sample_indices[fidx], num_per_col[fidx], rest_max_cnt);
Guolin Ke's avatar
Guolin Ke committed
135
136
137
138
139
140
141
142
      if (cnt >= 0 && cnt <= rest_max_cnt) {
        data_size_t rest_non_zero_data = static_cast<data_size_t>(
          static_cast<double>(cur_non_zero_cnt - cnt) * num_data / total_sample_cnt);
        if (rest_non_zero_data < filter_cnt) { continue; }
        need_new_group = false;
        features_in_group[gid].push_back(fidx);
        group_conflict_cnt[gid] += cnt;
        group_non_zero_cnt[gid] += cur_non_zero_cnt - cnt;
143
144
145
        if (!is_filtered_feature) {
          MarkUsed(&conflict_marks[gid], sample_indices[fidx], num_per_col[fidx]);
        }
Guolin Ke's avatar
Guolin Ke committed
146
147
148
        if (is_use_gpu) {
          group_num_bin[gid] += bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0);
        }
Guolin Ke's avatar
Guolin Ke committed
149
150
151
152
153
154
155
156
        break;
      }
    }
    if (need_new_group) {
      features_in_group.emplace_back();
      features_in_group.back().push_back(fidx);
      group_conflict_cnt.push_back(0);
      conflict_marks.emplace_back(total_sample_cnt, false);
157
158
159
      if (!is_filtered_feature) {
        MarkUsed(&(conflict_marks.back()), sample_indices[fidx], num_per_col[fidx]);
      }
Guolin Ke's avatar
Guolin Ke committed
160
      group_non_zero_cnt.emplace_back(cur_non_zero_cnt);
Guolin Ke's avatar
Guolin Ke committed
161
162
163
      if (is_use_gpu) {
        group_num_bin.push_back(1 + bin_mappers[fidx]->num_bin() + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0));
      }
Guolin Ke's avatar
Guolin Ke committed
164
165
166
167
168
    }
  }
  return features_in_group;
}

Guolin Ke's avatar
Guolin Ke committed
169
std::vector<std::vector<int>> FastFeatureBundling(const std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
Guolin Ke's avatar
Guolin Ke committed
170
                                                  int** sample_indices,
Guolin Ke's avatar
Guolin Ke committed
171
                                                  double** sample_values,
Guolin Ke's avatar
Guolin Ke committed
172
                                                  const int* num_per_col,
173
                                                  int num_sample_col,
Guolin Ke's avatar
Guolin Ke committed
174
175
176
177
178
179
                                                  size_t total_sample_cnt,
                                                  const std::vector<int>& used_features,
                                                  double max_conflict_rate,
                                                  data_size_t num_data,
                                                  data_size_t min_data,
                                                  double sparse_threshold,
Guolin Ke's avatar
Guolin Ke committed
180
181
                                                  bool is_enable_sparse,
                                                  bool is_use_gpu) {
Guolin Ke's avatar
Guolin Ke committed
182
183
184
185
  // filter is based on sampling data, so decrease its range
  const data_size_t filter_cnt = static_cast<data_size_t>(static_cast<double>(0.95 * min_data) / num_data * total_sample_cnt);
  const data_size_t max_error_cnt = static_cast<data_size_t>(total_sample_cnt * max_conflict_rate);
  std::vector<size_t> feature_non_zero_cnt;
186
  feature_non_zero_cnt.reserve(used_features.size());
Guolin Ke's avatar
Guolin Ke committed
187
188
  // put dense feature first
  for (auto fidx : used_features) {
189
190
191
192
193
    if (fidx < num_sample_col) {
      feature_non_zero_cnt.emplace_back(num_per_col[fidx]);
    } else {
      feature_non_zero_cnt.emplace_back(0);
    }
Guolin Ke's avatar
Guolin Ke committed
194
195
196
  }
  // sort by non zero cnt
  std::vector<int> sorted_idx;
197
  sorted_idx.reserve(used_features.size());
198
  for (int i = 0; i < static_cast<int>(used_features.size()); ++i) {
Guolin Ke's avatar
Guolin Ke committed
199
200
201
    sorted_idx.emplace_back(i);
  }
  // sort by non zero cnt, bigger first
202
203
  std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
                   [&feature_non_zero_cnt](int a, int b) {
Guolin Ke's avatar
Guolin Ke committed
204
205
206
207
    return feature_non_zero_cnt[a] > feature_non_zero_cnt[b];
  });

  std::vector<int> feature_order_by_cnt;
208
  feature_order_by_cnt.reserve(sorted_idx.size());
Guolin Ke's avatar
Guolin Ke committed
209
210
211
  for (auto sidx : sorted_idx) {
    feature_order_by_cnt.push_back(used_features[sidx]);
  }
Guolin Ke's avatar
Guolin Ke committed
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
  std::vector<std::vector<int>> tmp_indices;
  std::vector<int> tmp_num_per_col(num_sample_col, 0);
  for (auto fidx : used_features) {
    if (fidx >= num_sample_col) {
      continue;
    }
    auto ret = FixSampleIndices(bin_mappers[fidx].get(), static_cast<int>(total_sample_cnt), num_per_col[fidx], sample_indices[fidx], sample_values[fidx]);
    if (!ret.empty()) {
      tmp_indices.push_back(ret);
      tmp_num_per_col[fidx] = static_cast<int>(ret.size());
      sample_indices[fidx] = tmp_indices.back().data();
    } else {
      tmp_num_per_col[fidx] = num_per_col[fidx];
    }
  }
  auto features_in_group = FindGroups(bin_mappers, used_features, sample_indices, tmp_num_per_col.data(), num_sample_col, total_sample_cnt, max_error_cnt, filter_cnt, num_data, is_use_gpu);
  auto group2 = FindGroups(bin_mappers, feature_order_by_cnt, sample_indices, tmp_num_per_col.data(), num_sample_col, total_sample_cnt, max_error_cnt, filter_cnt, num_data, is_use_gpu);
Guolin Ke's avatar
Guolin Ke committed
229
230
231
232
233
234
235
236
237
238
239
240
241
242
  if (features_in_group.size() > group2.size()) {
    features_in_group = group2;
  }
  std::vector<std::vector<int>> ret;
  for (size_t i = 0; i < features_in_group.size(); ++i) {
    if (features_in_group[i].size() <= 1 || features_in_group[i].size() >= 5) {
      ret.push_back(features_in_group[i]);
    } else {
      int cnt_non_zero = 0;
      for (size_t j = 0; j < features_in_group[i].size(); ++j) {
        const int fidx = features_in_group[i][j];
        cnt_non_zero += static_cast<int>(num_data * (1.0f - bin_mappers[fidx]->sparse_rate()));
      }
      double sparse_rate = 1.0f - static_cast<double>(cnt_non_zero) / (num_data);
243
      // take apart small sparse group, due it will not gain on speed
Guolin Ke's avatar
Guolin Ke committed
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
      if (sparse_rate >= sparse_threshold && is_enable_sparse) {
        for (size_t j = 0; j < features_in_group[i].size(); ++j) {
          const int fidx = features_in_group[i][j];
          ret.emplace_back();
          ret.back().push_back(fidx);
        }
      } else {
        ret.push_back(features_in_group[i]);
      }
    }
  }
  // shuffle groups
  int num_group = static_cast<int>(ret.size());
  Random tmp_rand(12);
  for (int i = 0; i < num_group - 1; ++i) {
    int j = tmp_rand.NextShort(i + 1, num_group);
    std::swap(ret[i], ret[j]);
  }
  return ret;
}

Guolin Ke's avatar
Guolin Ke committed
265
void Dataset::Construct(
Guolin Ke's avatar
Guolin Ke committed
266
  std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
267
  int num_total_features,
268
  const std::vector<std::vector<double>>& forced_bins,
Guolin Ke's avatar
Guolin Ke committed
269
  int** sample_non_zero_indices,
Guolin Ke's avatar
Guolin Ke committed
270
  double** sample_values,
Guolin Ke's avatar
Guolin Ke committed
271
  const int* num_per_col,
272
  int num_sample_col,
Guolin Ke's avatar
Guolin Ke committed
273
  size_t total_sample_cnt,
Guolin Ke's avatar
Guolin Ke committed
274
  const Config& io_config) {
275
276
  num_total_features_ = num_total_features;
  CHECK(num_total_features_ == static_cast<int>(bin_mappers->size()));
277
  sparse_threshold_ = io_config.sparse_threshold;
Guolin Ke's avatar
Guolin Ke committed
278
279
  // get num_features
  std::vector<int> used_features;
Guolin Ke's avatar
Guolin Ke committed
280
  auto& ref_bin_mappers = *bin_mappers;
Guolin Ke's avatar
Guolin Ke committed
281
  for (int i = 0; i < static_cast<int>(bin_mappers->size()); ++i) {
Guolin Ke's avatar
Guolin Ke committed
282
    if (ref_bin_mappers[i] != nullptr && !ref_bin_mappers[i]->is_trivial()) {
Guolin Ke's avatar
Guolin Ke committed
283
      used_features.emplace_back(i);
Guolin Ke's avatar
Guolin Ke committed
284
    }
Guolin Ke's avatar
Guolin Ke committed
285
  }
Guolin Ke's avatar
Guolin Ke committed
286
  if (used_features.empty()) {
287
    Log::Warning("There are no meaningful features, as all feature values are constant.");
Guolin Ke's avatar
Guolin Ke committed
288
  }
Guolin Ke's avatar
Guolin Ke committed
289
290
  auto features_in_group = NoGroup(used_features);

291
  if (io_config.enable_bundle && !used_features.empty()) {
Guolin Ke's avatar
Guolin Ke committed
292
    features_in_group = FastFeatureBundling(*bin_mappers,
Guolin Ke's avatar
Guolin Ke committed
293
                                            sample_non_zero_indices, sample_values, num_per_col, num_sample_col, total_sample_cnt,
Guolin Ke's avatar
Guolin Ke committed
294
295
                                            used_features, io_config.max_conflict_rate,
                                            num_data_, io_config.min_data_in_leaf,
Guolin Ke's avatar
Guolin Ke committed
296
                                            sparse_threshold_, io_config.is_enable_sparse, io_config.device_type == std::string("gpu"));
Guolin Ke's avatar
Guolin Ke committed
297
298
  }

Guolin Ke's avatar
Guolin Ke committed
299
300
301
302
303
304
305
306
307
308
  num_features_ = 0;
  for (const auto& fs : features_in_group) {
    num_features_ += static_cast<int>(fs.size());
  }
  int cur_fidx = 0;
  used_feature_map_ = std::vector<int>(num_total_features_, -1);
  num_groups_ = static_cast<int>(features_in_group.size());
  real_feature_idx_.resize(num_features_);
  feature2group_.resize(num_features_);
  feature2subfeature_.resize(num_features_);
Guolin Ke's avatar
Guolin Ke committed
309
  feature_need_push_zeros_.clear();
Guolin Ke's avatar
Guolin Ke committed
310
311
312
313
314
315
316
317
318
319
320
  for (int i = 0; i < num_groups_; ++i) {
    auto cur_features = features_in_group[i];
    int cur_cnt_features = static_cast<int>(cur_features.size());
    // get bin_mappers
    std::vector<std::unique_ptr<BinMapper>> cur_bin_mappers;
    for (int j = 0; j < cur_cnt_features; ++j) {
      int real_fidx = cur_features[j];
      used_feature_map_[real_fidx] = cur_fidx;
      real_feature_idx_[cur_fidx] = real_fidx;
      feature2group_[cur_fidx] = i;
      feature2subfeature_[cur_fidx] = j;
Guolin Ke's avatar
Guolin Ke committed
321
      cur_bin_mappers.emplace_back(ref_bin_mappers[real_fidx].release());
Guolin Ke's avatar
Guolin Ke committed
322
323
324
      if (cur_bin_mappers.back()->GetDefaultBin() != cur_bin_mappers.back()->GetMostFreqBin()) {
        feature_need_push_zeros_.push_back(cur_fidx);
      }
Guolin Ke's avatar
Guolin Ke committed
325
326
327
      ++cur_fidx;
    }
    feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>(
Guolin Ke's avatar
Guolin Ke committed
328
      new FeatureGroup(cur_cnt_features, &cur_bin_mappers, num_data_, sparse_threshold_,
Guolin Ke's avatar
Guolin Ke committed
329
                       io_config.is_enable_sparse)));
Guolin Ke's avatar
Guolin Ke committed
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
  }
  feature_groups_.shrink_to_fit();
  group_bin_boundaries_.clear();
  uint64_t num_total_bin = 0;
  group_bin_boundaries_.push_back(num_total_bin);
  for (int i = 0; i < num_groups_; ++i) {
    num_total_bin += feature_groups_[i]->num_total_bin_;
    group_bin_boundaries_.push_back(num_total_bin);
  }
  int last_group = 0;
  group_feature_start_.reserve(num_groups_);
  group_feature_cnt_.reserve(num_groups_);
  group_feature_start_.push_back(0);
  group_feature_cnt_.push_back(1);
  for (int i = 1; i < num_features_; ++i) {
    const int group = feature2group_[i];
    if (group == last_group) {
      group_feature_cnt_.back() = group_feature_cnt_.back() + 1;
    } else {
      group_feature_start_.push_back(i);
      group_feature_cnt_.push_back(1);
      last_group = group;
    }
  }
Guolin Ke's avatar
Guolin Ke committed
354
355
356
357
358
359
360
361
362
363
364
365
366
367

  if (!io_config.monotone_constraints.empty()) {
    CHECK(static_cast<size_t>(num_total_features_) == io_config.monotone_constraints.size());
    monotone_types_.resize(num_features_);
    for (int i = 0; i < num_total_features_; ++i) {
      int inner_fidx = InnerFeatureIndex(i);
      if (inner_fidx >= 0) {
        monotone_types_[inner_fidx] = io_config.monotone_constraints[i];
      }
    }
    if (ArrayArgs<int8_t>::CheckAllZero(monotone_types_)) {
      monotone_types_.clear();
    }
  }
Guolin Ke's avatar
Guolin Ke committed
368
369
370
371
372
373
374
375
376
377
378
379
380
  if (!io_config.feature_contri.empty()) {
    CHECK(static_cast<size_t>(num_total_features_) == io_config.feature_contri.size());
    feature_penalty_.resize(num_features_);
    for (int i = 0; i < num_total_features_; ++i) {
      int inner_fidx = InnerFeatureIndex(i);
      if (inner_fidx >= 0) {
        feature_penalty_[inner_fidx] = std::max(0.0, io_config.feature_contri[i]);
      }
    }
    if (ArrayArgs<double>::CheckAll(feature_penalty_, 1.0)) {
      feature_penalty_.clear();
    }
  }
Belinda Trotta's avatar
Belinda Trotta committed
381
382
383
384
385
386
  if (!io_config.max_bin_by_feature.empty()) {
    CHECK(static_cast<size_t>(num_total_features_) == io_config.max_bin_by_feature.size());
    CHECK(*(std::min_element(io_config.max_bin_by_feature.begin(), io_config.max_bin_by_feature.end())) > 1);
    max_bin_by_feature_.resize(num_total_features_);
    max_bin_by_feature_.assign(io_config.max_bin_by_feature.begin(), io_config.max_bin_by_feature.end());
  }
387
  forced_bin_bounds_ = forced_bins;
388
389
390
391
392
393
394
395
396
397
398
399
400
401
  max_bin_ = io_config.max_bin;
  min_data_in_bin_ = io_config.min_data_in_bin;
  bin_construct_sample_cnt_ = io_config.bin_construct_sample_cnt;
  use_missing_ = io_config.use_missing;
  zero_as_missing_ = io_config.zero_as_missing;
}

void Dataset::ResetConfig(const char* parameters) {
  auto param = Config::Str2Map(parameters);
  Config io_config;
  io_config.Set(param);
  if (param.count("max_bin") && io_config.max_bin != max_bin_) {
    Log::Warning("Cannot change max_bin after constructed Dataset handle.");
  }
Belinda Trotta's avatar
Belinda Trotta committed
402
403
404
  if (param.count("max_bin_by_feature") && io_config.max_bin_by_feature != max_bin_by_feature_) {
    Log::Warning("Cannot change max_bin_by_feature after constructed Dataset handle.");
  }
405
406
407
408
409
410
411
412
413
414
415
416
  if (param.count("bin_construct_sample_cnt") && io_config.bin_construct_sample_cnt != bin_construct_sample_cnt_) {
    Log::Warning("Cannot change bin_construct_sample_cnt after constructed Dataset handle.");
  }
  if (param.count("min_data_in_bin") && io_config.min_data_in_bin != min_data_in_bin_) {
    Log::Warning("Cannot change min_data_in_bin after constructed Dataset handle.");
  }
  if (param.count("use_missing") && io_config.use_missing != use_missing_) {
    Log::Warning("Cannot change use_missing after constructed Dataset handle.");
  }
  if (param.count("zero_as_missing") && io_config.zero_as_missing != zero_as_missing_) {
    Log::Warning("Cannot change zero_as_missing after constructed Dataset handle.");
  }
Guolin Ke's avatar
Guolin Ke committed
417
418
419
  if (param.count("sparse_threshold") && io_config.sparse_threshold != sparse_threshold_) {
    Log::Warning("Cannot change sparse_threshold after constructed Dataset handle.");
  }
420
421
422
  if (param.count("forcedbins_filename")) {
    Log::Warning("Cannot change forced bins after constructed Dataset handle.");
  }
Guolin Ke's avatar
Guolin Ke committed
423

424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
  if (!io_config.monotone_constraints.empty()) {
    CHECK(static_cast<size_t>(num_total_features_) == io_config.monotone_constraints.size());
    monotone_types_.resize(num_features_);
    for (int i = 0; i < num_total_features_; ++i) {
      int inner_fidx = InnerFeatureIndex(i);
      if (inner_fidx >= 0) {
        monotone_types_[inner_fidx] = io_config.monotone_constraints[i];
      }
    }
    if (ArrayArgs<int8_t>::CheckAllZero(monotone_types_)) {
      monotone_types_.clear();
    }
  }
  if (!io_config.feature_contri.empty()) {
    CHECK(static_cast<size_t>(num_total_features_) == io_config.feature_contri.size());
    feature_penalty_.resize(num_features_);
    for (int i = 0; i < num_total_features_; ++i) {
      int inner_fidx = InnerFeatureIndex(i);
      if (inner_fidx >= 0) {
        feature_penalty_[inner_fidx] = std::max(0.0, io_config.feature_contri[i]);
      }
    }
    if (ArrayArgs<double>::CheckAll(feature_penalty_, 1.0)) {
      feature_penalty_.clear();
    }
  }
Guolin Ke's avatar
Guolin Ke committed
450
451
}

Guolin Ke's avatar
Guolin Ke committed
452
void Dataset::FinishLoad() {
Guolin Ke's avatar
Guolin Ke committed
453
  if (is_finish_load_) { return; }
454
455
456
457
458
459
460
461
462
  if (num_groups_ > 0) {
    OMP_INIT_EX();
#pragma omp parallel for schedule(guided)
    for (int i = 0; i < num_groups_; ++i) {
      OMP_LOOP_EX_BEGIN();
      feature_groups_[i]->bin_data_->FinishLoad();
      OMP_LOOP_EX_END();
    }
    OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
463
  }
Guolin Ke's avatar
Guolin Ke committed
464
  is_finish_load_ = true;
Guolin Ke's avatar
Guolin Ke committed
465
}
Guolin Ke's avatar
Guolin Ke committed
466

467
void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) {
Guolin Ke's avatar
Guolin Ke committed
468
  feature_groups_.clear();
Guolin Ke's avatar
Guolin Ke committed
469
  num_features_ = dataset->num_features_;
Guolin Ke's avatar
Guolin Ke committed
470
  num_groups_ = dataset->num_groups_;
471
  sparse_threshold_ = dataset->sparse_threshold_;
Guolin Ke's avatar
Guolin Ke committed
472
  // copy feature bin mapper data
Guolin Ke's avatar
Guolin Ke committed
473
474
475
476
477
478
479
  for (int i = 0; i < num_groups_; ++i) {
    std::vector<std::unique_ptr<BinMapper>> bin_mappers;
    for (int j = 0; j < dataset->feature_groups_[i]->num_feature_; ++j) {
      bin_mappers.emplace_back(new BinMapper(*(dataset->feature_groups_[i]->bin_mappers_[j])));
    }
    feature_groups_.emplace_back(new FeatureGroup(
      dataset->feature_groups_[i]->num_feature_,
Guolin Ke's avatar
Guolin Ke committed
480
      &bin_mappers,
Guolin Ke's avatar
Guolin Ke committed
481
      num_data_,
Guolin Ke's avatar
Guolin Ke committed
482
      dataset->feature_groups_[i]->is_sparse_));
Guolin Ke's avatar
Guolin Ke committed
483
  }
Guolin Ke's avatar
Guolin Ke committed
484
  feature_groups_.shrink_to_fit();
Guolin Ke's avatar
Guolin Ke committed
485
486
487
  used_feature_map_ = dataset->used_feature_map_;
  num_total_features_ = dataset->num_total_features_;
  feature_names_ = dataset->feature_names_;
Guolin Ke's avatar
Guolin Ke committed
488
  label_idx_ = dataset->label_idx_;
Guolin Ke's avatar
Guolin Ke committed
489
490
491
492
493
494
  real_feature_idx_ = dataset->real_feature_idx_;
  feature2group_ = dataset->feature2group_;
  feature2subfeature_ = dataset->feature2subfeature_;
  group_bin_boundaries_ = dataset->group_bin_boundaries_;
  group_feature_start_ = dataset->group_feature_start_;
  group_feature_cnt_ = dataset->group_feature_cnt_;
Guolin Ke's avatar
Guolin Ke committed
495
  monotone_types_ = dataset->monotone_types_;
Guolin Ke's avatar
Guolin Ke committed
496
  feature_penalty_ = dataset->feature_penalty_;
497
  forced_bin_bounds_ = dataset->forced_bin_bounds_;
Guolin Ke's avatar
Guolin Ke committed
498
  feature_need_push_zeros_ = dataset->feature_need_push_zeros_;
Guolin Ke's avatar
Guolin Ke committed
499
500
501
502
503
504
}

void Dataset::CreateValid(const Dataset* dataset) {
  feature_groups_.clear();
  num_features_ = dataset->num_features_;
  num_groups_ = num_features_;
505
  sparse_threshold_ = dataset->sparse_threshold_;
Guolin Ke's avatar
Guolin Ke committed
506
507
508
509
  bool is_enable_sparse = true;
  feature2group_.clear();
  feature2subfeature_.clear();
  // copy feature bin mapper data
Guolin Ke's avatar
Guolin Ke committed
510
  feature_need_push_zeros_.clear();
Guolin Ke's avatar
Guolin Ke committed
511
512
513
  for (int i = 0; i < num_features_; ++i) {
    std::vector<std::unique_ptr<BinMapper>> bin_mappers;
    bin_mappers.emplace_back(new BinMapper(*(dataset->FeatureBinMapper(i))));
Guolin Ke's avatar
Guolin Ke committed
514
515
516
    if (bin_mappers.back()->GetDefaultBin() != bin_mappers.back()->GetMostFreqBin()) {
      feature_need_push_zeros_.push_back(i);
    }
Guolin Ke's avatar
Guolin Ke committed
517
518
    feature_groups_.emplace_back(new FeatureGroup(
      1,
Guolin Ke's avatar
Guolin Ke committed
519
      &bin_mappers,
Guolin Ke's avatar
Guolin Ke committed
520
      num_data_,
Guolin Ke's avatar
Guolin Ke committed
521
      sparse_threshold_,
Guolin Ke's avatar
Guolin Ke committed
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
      is_enable_sparse));
    feature2group_.push_back(i);
    feature2subfeature_.push_back(0);
  }

  feature_groups_.shrink_to_fit();
  used_feature_map_ = dataset->used_feature_map_;
  num_total_features_ = dataset->num_total_features_;
  feature_names_ = dataset->feature_names_;
  label_idx_ = dataset->label_idx_;
  real_feature_idx_ = dataset->real_feature_idx_;
  group_bin_boundaries_.clear();
  uint64_t num_total_bin = 0;
  group_bin_boundaries_.push_back(num_total_bin);
  for (int i = 0; i < num_groups_; ++i) {
    num_total_bin += feature_groups_[i]->num_total_bin_;
    group_bin_boundaries_.push_back(num_total_bin);
  }
  int last_group = 0;
  group_feature_start_.reserve(num_groups_);
  group_feature_cnt_.reserve(num_groups_);
  group_feature_start_.push_back(0);
  group_feature_cnt_.push_back(1);
  for (int i = 1; i < num_features_; ++i) {
    const int group = feature2group_[i];
    if (group == last_group) {
      group_feature_cnt_.back() = group_feature_cnt_.back() + 1;
    } else {
      group_feature_start_.push_back(i);
      group_feature_cnt_.push_back(1);
      last_group = group;
    }
  }
Guolin Ke's avatar
Guolin Ke committed
555
  monotone_types_ = dataset->monotone_types_;
Guolin Ke's avatar
Guolin Ke committed
556
  feature_penalty_ = dataset->feature_penalty_;
557
  forced_bin_bounds_ = dataset->forced_bin_bounds_;
Guolin Ke's avatar
Guolin Ke committed
558
559
}

Guolin Ke's avatar
Guolin Ke committed
560
561
562
// Changes the number of rows to `num_data` and resizes the bin data of every feature
// group accordingly (in an OpenMP parallel loop). Does nothing when the size is unchanged.
void Dataset::ReSize(data_size_t num_data) {
  if (num_data_ != num_data) {
    num_data_ = num_data;
    OMP_INIT_EX();
    #pragma omp parallel for schedule(static)
    for (int group = 0; group < num_groups_; ++group) {
      OMP_LOOP_EX_BEGIN();
      feature_groups_[group]->bin_data_->ReSize(num_data_);
      OMP_LOOP_EX_END();
    }
    OMP_THROW_EX();
  }
}

void Dataset::CopySubset(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data) {
  CHECK(num_used_indices == num_data_);
576
  OMP_INIT_EX();
Guolin Ke's avatar
Guolin Ke committed
577
  #pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
578
  for (int group = 0; group < num_groups_; ++group) {
579
    OMP_LOOP_EX_BEGIN();
Guolin Ke's avatar
Guolin Ke committed
580
    feature_groups_[group]->CopySubset(fullset->feature_groups_[group].get(), used_indices, num_used_indices);
581
    OMP_LOOP_EX_END();
Guolin Ke's avatar
Guolin Ke committed
582
  }
583
  OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
584
  if (need_meta_data) {
Guolin Ke's avatar
Guolin Ke committed
585
    metadata_.Init(fullset->metadata_, used_indices, num_used_indices);
Guolin Ke's avatar
Guolin Ke committed
586
  }
Guolin Ke's avatar
Guolin Ke committed
587
  is_finish_load_ = true;
Guolin Ke's avatar
Guolin Ke committed
588
589
}

590
bool Dataset::SetFloatField(const char* field_name, const float* field_data, data_size_t num_element) {
Guolin Ke's avatar
Guolin Ke committed
591
592
593
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("label") || name == std::string("target")) {
594
    #ifdef LABEL_T_USE_DOUBLE
595
    Log::Fatal("Don't support LABEL_T_USE_DOUBLE");
596
    #else
597
    metadata_.SetLabel(field_data, num_element);
598
    #endif
Guolin Ke's avatar
Guolin Ke committed
599
  } else if (name == std::string("weight") || name == std::string("weights")) {
600
    #ifdef LABEL_T_USE_DOUBLE
601
    Log::Fatal("Don't support LABEL_T_USE_DOUBLE");
602
    #else
603
    metadata_.SetWeights(field_data, num_element);
604
    #endif
Guolin Ke's avatar
Guolin Ke committed
605
606
607
608
609
610
611
612
613
614
  } else {
    return false;
  }
  return true;
}

bool Dataset::SetDoubleField(const char* field_name, const double* field_data, data_size_t num_element) {
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("init_score")) {
615
    metadata_.SetInitScore(field_data, num_element);
Guolin Ke's avatar
Guolin Ke committed
616
  } else {
617
    return false;
Guolin Ke's avatar
Guolin Ke committed
618
  }
619
  return true;
Guolin Ke's avatar
Guolin Ke committed
620
621
}

622
623
624
625
bool Dataset::SetIntField(const char* field_name, const int* field_data, data_size_t num_element) {
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("query") || name == std::string("group")) {
Guolin Ke's avatar
Guolin Ke committed
626
    metadata_.SetQuery(field_data, num_element);
627
628
629
630
631
632
  } else {
    return false;
  }
  return true;
}

Guolin Ke's avatar
Guolin Ke committed
633
bool Dataset::GetFloatField(const char* field_name, data_size_t* out_len, const float** out_ptr) {
634
635
636
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("label") || name == std::string("target")) {
637
    #ifdef LABEL_T_USE_DOUBLE
638
    Log::Fatal("Don't support LABEL_T_USE_DOUBLE");
639
    #else
640
641
    *out_ptr = metadata_.label();
    *out_len = num_data_;
642
    #endif
643
  } else if (name == std::string("weight") || name == std::string("weights")) {
644
    #ifdef LABEL_T_USE_DOUBLE
645
    Log::Fatal("Don't support LABEL_T_USE_DOUBLE");
646
    #else
647
648
    *out_ptr = metadata_.weights();
    *out_len = num_data_;
649
    #endif
Guolin Ke's avatar
Guolin Ke committed
650
651
652
653
654
655
656
657
658
659
  } else {
    return false;
  }
  return true;
}

bool Dataset::GetDoubleField(const char* field_name, data_size_t* out_len, const double** out_ptr) {
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("init_score")) {
660
    *out_ptr = metadata_.init_score();
Guolin Ke's avatar
Guolin Ke committed
661
    *out_len = static_cast<data_size_t>(metadata_.num_init_score());
662
  } else if (name == std::string("feature_penalty")) {
663
    *out_ptr = feature_penalty_.data();
Guolin Ke's avatar
Guolin Ke committed
664
    *out_len = static_cast<data_size_t>(feature_penalty_.size());
665
  } else {
666
667
    return false;
  }
668
  return true;
669
670
}

Guolin Ke's avatar
Guolin Ke committed
671
// Exposes an int-typed metadata field by name (the name is passed through
// Common::Trim first). "query" and "group" both yield
// metadata_.query_boundaries() with length metadata_.num_queries() + 1.
// Returns false (outputs untouched) for any other name.
bool Dataset::GetIntField(const char* field_name, data_size_t* out_len, const int** out_ptr) {
  std::string raw_name(field_name);
  const std::string key = Common::Trim(raw_name);
  const bool is_query_field = (key == "query") || (key == "group");
  if (!is_query_field) {
    return false;
  }
  *out_ptr = metadata_.query_boundaries();
  *out_len = metadata_.num_queries() + 1;
  return true;
}

683
684
685
686
687
bool Dataset::GetInt8Field(const char* field_name, data_size_t* out_len, const int8_t** out_ptr) {
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("monotone_constraints")) {
    *out_ptr = monotone_types_.data();
Guolin Ke's avatar
Guolin Ke committed
688
    *out_len = static_cast<data_size_t>(monotone_types_.size());
689
690
691
692
693
694
  } else {
    return false;
  }
  return true;
}

Guolin Ke's avatar
Guolin Ke committed
695
void Dataset::SaveBinaryFile(const char* bin_filename) {
Guolin Ke's avatar
Guolin Ke committed
696
  if (bin_filename != nullptr
Guolin Ke's avatar
Guolin Ke committed
697
      && std::string(bin_filename) == data_filename_) {
698
    Log::Warning("Bianry file %s already exists", bin_filename);
Guolin Ke's avatar
Guolin Ke committed
699
700
    return;
  }
Guolin Ke's avatar
Guolin Ke committed
701
  // if not pass a filename, just append ".bin" of original file
Guolin Ke's avatar
Guolin Ke committed
702
  std::string bin_filename_str(data_filename_);
Guolin Ke's avatar
Guolin Ke committed
703
704
705
706
  if (bin_filename == nullptr || bin_filename[0] == '\0') {
    bin_filename_str.append(".bin");
    bin_filename = bin_filename_str.c_str();
  }
Guolin Ke's avatar
Guolin Ke committed
707
  bool is_file_existed = false;
708
709

  if (VirtualFileWriter::Exists(bin_filename)) {
Guolin Ke's avatar
Guolin Ke committed
710
    is_file_existed = true;
711
    Log::Warning("File %s exists, cannot save binary to it", bin_filename);
Guolin Ke's avatar
Guolin Ke committed
712
  }
Guolin Ke's avatar
Guolin Ke committed
713

Guolin Ke's avatar
Guolin Ke committed
714
  if (!is_file_existed) {
715
716
    auto writer = VirtualFileWriter::Make(bin_filename);
    if (!writer->Init()) {
Guolin Ke's avatar
Guolin Ke committed
717
      Log::Fatal("Cannot write binary data to %s ", bin_filename);
Guolin Ke's avatar
Guolin Ke committed
718
    }
719
    Log::Info("Saving data to binary file %s", bin_filename);
720
    size_t size_of_token = std::strlen(binary_file_token);
721
    writer->Write(binary_file_token, size_of_token);
Guolin Ke's avatar
Guolin Ke committed
722
    // get size of header
Guolin Ke's avatar
Guolin Ke committed
723
    size_t size_of_header = sizeof(num_data_) + sizeof(num_features_) + sizeof(num_total_features_)
Guolin Ke's avatar
Guolin Ke committed
724
      + sizeof(int) * num_total_features_ + sizeof(label_idx_) + sizeof(num_groups_) + sizeof(sparse_threshold_)
Guolin Ke's avatar
Guolin Ke committed
725
      + 3 * sizeof(int) * num_features_ + sizeof(uint64_t) * (num_groups_ + 1) + 2 * sizeof(int) * num_groups_ + sizeof(int8_t) * num_features_
Belinda Trotta's avatar
Belinda Trotta committed
726
      + sizeof(double) * num_features_ + sizeof(int32_t) * num_total_features_ + sizeof(int) * 3 + sizeof(bool) * 2;
727
728
729
730
    // size of feature names
    for (int i = 0; i < num_total_features_; ++i) {
      size_of_header += feature_names_[i].size() + sizeof(int);
    }
731
732
733
734
    // size of forced bins
    for (int i = 0; i < num_total_features_; ++i) {
      size_of_header += forced_bin_bounds_[i].size() * sizeof(double) + sizeof(int);
    }
735
    writer->Write(&size_of_header, sizeof(size_of_header));
Guolin Ke's avatar
Guolin Ke committed
736
    // write header
737
738
739
740
    writer->Write(&num_data_, sizeof(num_data_));
    writer->Write(&num_features_, sizeof(num_features_));
    writer->Write(&num_total_features_, sizeof(num_total_features_));
    writer->Write(&label_idx_, sizeof(label_idx_));
741
742
743
744
745
    writer->Write(&max_bin_, sizeof(max_bin_));
    writer->Write(&bin_construct_sample_cnt_, sizeof(bin_construct_sample_cnt_));
    writer->Write(&min_data_in_bin_, sizeof(min_data_in_bin_));
    writer->Write(&use_missing_, sizeof(use_missing_));
    writer->Write(&zero_as_missing_, sizeof(zero_as_missing_));
Guolin Ke's avatar
Guolin Ke committed
746
    writer->Write(&sparse_threshold_, sizeof(sparse_threshold_));
747
748
749
750
751
752
753
754
    writer->Write(used_feature_map_.data(), sizeof(int) * num_total_features_);
    writer->Write(&num_groups_, sizeof(num_groups_));
    writer->Write(real_feature_idx_.data(), sizeof(int) * num_features_);
    writer->Write(feature2group_.data(), sizeof(int) * num_features_);
    writer->Write(feature2subfeature_.data(), sizeof(int) * num_features_);
    writer->Write(group_bin_boundaries_.data(), sizeof(uint64_t) * (num_groups_ + 1));
    writer->Write(group_feature_start_.data(), sizeof(int) * num_groups_);
    writer->Write(group_feature_cnt_.data(), sizeof(int) * num_groups_);
Guolin Ke's avatar
Guolin Ke committed
755
756
757
758
759
760
761
    if (monotone_types_.empty()) {
      ArrayArgs<int8_t>::Assign(&monotone_types_, 0, num_features_);
    }
    writer->Write(monotone_types_.data(), sizeof(int8_t) * num_features_);
    if (ArrayArgs<int8_t>::CheckAllZero(monotone_types_)) {
      monotone_types_.clear();
    }
Guolin Ke's avatar
Guolin Ke committed
762
763
764
765
766
767
768
    if (feature_penalty_.empty()) {
      ArrayArgs<double>::Assign(&feature_penalty_, 1.0, num_features_);
    }
    writer->Write(feature_penalty_.data(), sizeof(double) * num_features_);
    if (ArrayArgs<double>::CheckAll(feature_penalty_, 1.0)) {
      feature_penalty_.clear();
    }
Belinda Trotta's avatar
Belinda Trotta committed
769
770
771
772
773
774
775
    if (max_bin_by_feature_.empty()) {
      ArrayArgs<int32_t>::Assign(&max_bin_by_feature_, -1, num_total_features_);
    }
    writer->Write(max_bin_by_feature_.data(), sizeof(int32_t) * num_total_features_);
    if (ArrayArgs<int32_t>::CheckAll(max_bin_by_feature_, -1)) {
      max_bin_by_feature_.clear();
    }
776
777
778
    // write feature names
    for (int i = 0; i < num_total_features_; ++i) {
      int str_len = static_cast<int>(feature_names_[i].size());
779
      writer->Write(&str_len, sizeof(int));
780
      const char* c_str = feature_names_[i].c_str();
781
      writer->Write(c_str, sizeof(char) * str_len);
782
    }
783
784
785
786
    // write forced bins
    for (int i = 0; i < num_total_features_; ++i) {
      int num_bounds = static_cast<int>(forced_bin_bounds_[i].size());
      writer->Write(&num_bounds, sizeof(int));
787

788
789
790
791
      for (size_t j = 0; j < forced_bin_bounds_[i].size(); ++j) {
        writer->Write(&forced_bin_bounds_[i][j], sizeof(double));
      }
    }
792

Guolin Ke's avatar
Guolin Ke committed
793
794
    // get size of meta data
    size_t size_of_metadata = metadata_.SizesInByte();
795
    writer->Write(&size_of_metadata, sizeof(size_of_metadata));
Guolin Ke's avatar
Guolin Ke committed
796
    // write meta data
797
    metadata_.SaveBinaryToFile(writer.get());
Guolin Ke's avatar
Guolin Ke committed
798
799

    // write feature data
Guolin Ke's avatar
Guolin Ke committed
800
    for (int i = 0; i < num_groups_; ++i) {
Guolin Ke's avatar
Guolin Ke committed
801
      // get size of feature
Guolin Ke's avatar
Guolin Ke committed
802
      size_t size_of_feature = feature_groups_[i]->SizesInByte();
803
      writer->Write(&size_of_feature, sizeof(size_of_feature));
Guolin Ke's avatar
Guolin Ke committed
804
      // write feature
805
      feature_groups_[i]->SaveBinaryToFile(writer.get());
Guolin Ke's avatar
Guolin Ke committed
806
807
808
809
    }
  }
}

810
void Dataset::DumpTextFile(const char* text_filename) {
Guolin Ke's avatar
Guolin Ke committed
811
812
813
814
815
816
  FILE* file = NULL;
#if _MSC_VER
  fopen_s(&file, text_filename, "wt");
#else
  file = fopen(text_filename, "wt");
#endif
817
818
819
820
821
  fprintf(file, "num_features: %d\n", num_features_);
  fprintf(file, "num_total_features: %d\n", num_total_features_);
  fprintf(file, "num_groups: %d\n", num_groups_);
  fprintf(file, "num_data: %d\n", num_data_);
  fprintf(file, "feature_names: ");
822
  for (auto n : feature_names_) {
823
824
825
    fprintf(file, "%s, ", n.c_str());
  }
  fprintf(file, "\nmonotone_constraints: ");
826
  for (auto i : monotone_types_) {
827
828
829
    fprintf(file, "%d, ", i);
  }
  fprintf(file, "\nfeature_penalty: ");
830
  for (auto i : feature_penalty_) {
831
832
    fprintf(file, "%lf, ", i);
  }
Belinda Trotta's avatar
Belinda Trotta committed
833
834
835
836
  fprintf(file, "\nmax_bin_by_feature: ");
  for (auto i : max_bin_by_feature_) {
    fprintf(file, "%d, ", i);
  }
837
  fprintf(file, "\n");
838
  for (auto n : feature_names_) {
839
840
    fprintf(file, "%s, ", n.c_str());
  }
841
842
843
844
845
846
847
  fprintf(file, "\nforced_bins: ");
  for (int i = 0; i < num_total_features_; ++i) {
    fprintf(file, "\nfeature %d: ", i);
    for (size_t j = 0; j < forced_bin_bounds_[i].size(); ++j) {
      fprintf(file, "%lf, ", forced_bin_bounds_[i][j]);
    }
  }
848
849
  std::vector<std::unique_ptr<BinIterator>> iterators;
  iterators.reserve(num_features_);
850
  for (int j = 0; j < num_features_; ++j) {
851
852
853
854
    auto group_idx = feature2group_[j];
    auto sub_idx = feature2subfeature_[j];
    iterators.emplace_back(feature_groups_[group_idx]->SubFeatureIterator(sub_idx));
  }
855
  for (data_size_t i = 0; i < num_data_; ++i) {
856
    fprintf(file, "\n");
857
    for (int j = 0; j < num_total_features_; ++j) {
858
      auto inner_feature_idx = used_feature_map_[j];
859
860
      if (inner_feature_idx < 0) {
        fprintf(file, "NA, ");
861
      } else {
Guolin Ke's avatar
Guolin Ke committed
862
        fprintf(file, "%d, ", iterators[inner_feature_idx]->Get(i));
863
864
865
866
867
868
      }
    }
  }
  fclose(file);
}

869
870
871
void Dataset::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
                                  const data_size_t* data_indices, data_size_t num_data,
                                  int leaf_idx,
Guolin Ke's avatar
Guolin Ke committed
872
                                  std::vector<std::unique_ptr<OrderedBin>>* ordered_bins,
873
874
                                  const score_t* gradients, const score_t* hessians,
                                  score_t* ordered_gradients, score_t* ordered_hessians,
875
876
                                  bool is_constant_hessian,
                                  HistogramBinEntry* hist_data) const {
zhangjin's avatar
zhangjin committed
877
  if (leaf_idx < 0 || num_data < 0 || hist_data == nullptr) {
Guolin Ke's avatar
Guolin Ke committed
878
879
    return;
  }
Guolin Ke's avatar
Guolin Ke committed
880
881
882
883
884

  std::vector<int> used_group;
  used_group.reserve(num_groups_);
  for (int group = 0; group < num_groups_; ++group) {
    const int f_cnt = group_feature_cnt_[group];
885
    bool is_group_used = false;
Guolin Ke's avatar
Guolin Ke committed
886
887
888
    for (int j = 0; j < f_cnt; ++j) {
      const int fidx = group_feature_start_[group] + j;
      if (is_feature_used[fidx]) {
889
        is_group_used = true;
Guolin Ke's avatar
Guolin Ke committed
890
891
892
        break;
      }
    }
893
894
895
    if (is_group_used) {
      used_group.push_back(group);
    }
Guolin Ke's avatar
Guolin Ke committed
896
897
  }
  int num_used_group = static_cast<int>(used_group.size());
Guolin Ke's avatar
Guolin Ke committed
898
899
  auto ptr_ordered_grad = gradients;
  auto ptr_ordered_hess = hessians;
Guolin Ke's avatar
Guolin Ke committed
900
  auto& ref_ordered_bins = *ordered_bins;
Guolin Ke's avatar
Guolin Ke committed
901
  if (data_indices != nullptr && num_data < num_data_) {
902
903
904
905
906
907
908
909
910
911
912
    if (!is_constant_hessian) {
      #pragma omp parallel for schedule(static)
      for (data_size_t i = 0; i < num_data; ++i) {
        ordered_gradients[i] = gradients[data_indices[i]];
        ordered_hessians[i] = hessians[data_indices[i]];
      }
    } else {
      #pragma omp parallel for schedule(static)
      for (data_size_t i = 0; i < num_data; ++i) {
        ordered_gradients[i] = gradients[data_indices[i]];
      }
Guolin Ke's avatar
Guolin Ke committed
913
914
915
    }
    ptr_ordered_grad = ordered_gradients;
    ptr_ordered_hess = ordered_hessians;
916
917
918
    if (!is_constant_hessian) {
      OMP_INIT_EX();
      #pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
919
      for (int gi = 0; gi < num_used_group; ++gi) {
920
        OMP_LOOP_EX_BEGIN();
Guolin Ke's avatar
Guolin Ke committed
921
        int group = used_group[gi];
922
923
924
        // feature is not used
        auto data_ptr = hist_data + group_bin_boundaries_[group];
        const int num_bin = feature_groups_[group]->num_total_bin_;
Guolin Ke's avatar
Guolin Ke committed
925
        std::memset(reinterpret_cast<void*>(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry));
926
        // construct histograms for smaller leaf
Guolin Ke's avatar
Guolin Ke committed
927
        if (ref_ordered_bins[group] == nullptr) {
928
929
930
          // if not use ordered bin
          feature_groups_[group]->bin_data_->ConstructHistogram(
            data_indices,
931
            0,
932
933
934
935
936
937
            num_data,
            ptr_ordered_grad,
            ptr_ordered_hess,
            data_ptr);
        } else {
          // used ordered bin
Guolin Ke's avatar
Guolin Ke committed
938
939
940
941
          ref_ordered_bins[group]->ConstructHistogram(leaf_idx,
                                                      gradients,
                                                      hessians,
                                                      data_ptr);
942
        }
943
        OMP_LOOP_EX_END();
944
      }
945
946
947
948
      OMP_THROW_EX();
    } else {
      OMP_INIT_EX();
      #pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
949
      for (int gi = 0; gi < num_used_group; ++gi) {
950
        OMP_LOOP_EX_BEGIN();
Guolin Ke's avatar
Guolin Ke committed
951
        int group = used_group[gi];
952
953
954
        // feature is not used
        auto data_ptr = hist_data + group_bin_boundaries_[group];
        const int num_bin = feature_groups_[group]->num_total_bin_;
Guolin Ke's avatar
Guolin Ke committed
955
        std::memset(reinterpret_cast<void*>(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry));
956
        // construct histograms for smaller leaf
Guolin Ke's avatar
Guolin Ke committed
957
        if (ref_ordered_bins[group] == nullptr) {
958
959
960
          // if not use ordered bin
          feature_groups_[group]->bin_data_->ConstructHistogram(
            data_indices,
961
            0,
962
963
964
965
966
            num_data,
            ptr_ordered_grad,
            data_ptr);
        } else {
          // used ordered bin
Guolin Ke's avatar
Guolin Ke committed
967
968
969
          ref_ordered_bins[group]->ConstructHistogram(leaf_idx,
                                                      gradients,
                                                      data_ptr);
970
971
972
973
974
975
        }
        // fixed hessian.
        for (int i = 0; i < num_bin; ++i) {
          data_ptr[i].sum_hessians = data_ptr[i].cnt * hessians[0];
        }
        OMP_LOOP_EX_END();
Guolin Ke's avatar
Guolin Ke committed
976
      }
977
      OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
978
    }
979
  } else {
980
981
982
    if (!is_constant_hessian) {
      OMP_INIT_EX();
      #pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
983
      for (int gi = 0; gi < num_used_group; ++gi) {
984
        OMP_LOOP_EX_BEGIN();
Guolin Ke's avatar
Guolin Ke committed
985
        int group = used_group[gi];
986
987
988
        // feature is not used
        auto data_ptr = hist_data + group_bin_boundaries_[group];
        const int num_bin = feature_groups_[group]->num_total_bin_;
Guolin Ke's avatar
Guolin Ke committed
989
        std::memset(reinterpret_cast<void*>(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry));
990
        // construct histograms for smaller leaf
Guolin Ke's avatar
Guolin Ke committed
991
        if (ref_ordered_bins[group] == nullptr) {
992
993
          // if not use ordered bin
          feature_groups_[group]->bin_data_->ConstructHistogram(
994
            0,
995
996
997
998
999
1000
            num_data,
            ptr_ordered_grad,
            ptr_ordered_hess,
            data_ptr);
        } else {
          // used ordered bin
Guolin Ke's avatar
Guolin Ke committed
1001
1002
1003
1004
          ref_ordered_bins[group]->ConstructHistogram(leaf_idx,
                                                      gradients,
                                                      hessians,
                                                      data_ptr);
1005
1006
        }
        OMP_LOOP_EX_END();
1007
      }
1008
1009
1010
1011
      OMP_THROW_EX();
    } else {
      OMP_INIT_EX();
      #pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
1012
      for (int gi = 0; gi < num_used_group; ++gi) {
1013
        OMP_LOOP_EX_BEGIN();
Guolin Ke's avatar
Guolin Ke committed
1014
        int group = used_group[gi];
1015
1016
1017
        // feature is not used
        auto data_ptr = hist_data + group_bin_boundaries_[group];
        const int num_bin = feature_groups_[group]->num_total_bin_;
Guolin Ke's avatar
Guolin Ke committed
1018
        std::memset(reinterpret_cast<void*>(data_ptr + 1), 0, (num_bin - 1) * sizeof(HistogramBinEntry));
1019
        // construct histograms for smaller leaf
Guolin Ke's avatar
Guolin Ke committed
1020
        if (ref_ordered_bins[group] == nullptr) {
1021
1022
          // if not use ordered bin
          feature_groups_[group]->bin_data_->ConstructHistogram(
1023
            0,
1024
1025
1026
1027
1028
            num_data,
            ptr_ordered_grad,
            data_ptr);
        } else {
          // used ordered bin
Guolin Ke's avatar
Guolin Ke committed
1029
1030
1031
          ref_ordered_bins[group]->ConstructHistogram(leaf_idx,
                                                      gradients,
                                                      data_ptr);
1032
1033
1034
1035
1036
1037
        }
        // fixed hessian.
        for (int i = 0; i < num_bin; ++i) {
          data_ptr[i].sum_hessians = data_ptr[i].cnt * hessians[0];
        }
        OMP_LOOP_EX_END();
1038
      }
1039
      OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
1040
1041
1042
1043
1044
    }
  }
}

void Dataset::FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, data_size_t num_data,
1045
                           HistogramBinEntry* data) const {
Guolin Ke's avatar
Guolin Ke committed
1046
1047
1048
  const int group = feature2group_[feature_idx];
  const int sub_feature = feature2subfeature_[feature_idx];
  const BinMapper* bin_mapper = feature_groups_[group]->bin_mappers_[sub_feature].get();
Guolin Ke's avatar
Guolin Ke committed
1049
1050
  const int most_freq_bin = bin_mapper->GetMostFreqBin();
  if (most_freq_bin > 0) {
Guolin Ke's avatar
Guolin Ke committed
1051
    const int num_bin = bin_mapper->num_bin();
Guolin Ke's avatar
Guolin Ke committed
1052
1053
1054
    data[most_freq_bin].sum_gradients = sum_gradient;
    data[most_freq_bin].sum_hessians = sum_hessian;
    data[most_freq_bin].cnt = num_data;
Guolin Ke's avatar
Guolin Ke committed
1055
    for (int i = 0; i < num_bin; ++i) {
Guolin Ke's avatar
Guolin Ke committed
1056
1057
1058
1059
      if (i != most_freq_bin) {
        data[most_freq_bin].sum_gradients -= data[i].sum_gradients;
        data[most_freq_bin].sum_hessians -= data[i].sum_hessians;
        data[most_freq_bin].cnt -= data[i].cnt;
Guolin Ke's avatar
Guolin Ke committed
1060
1061
1062
1063
1064
      }
    }
  }
}

1065
// Appends a copy of every element of `src` to the end of `*dest`.
// A single range insert copies each element exactly once (the previous
// by-value loop copied every element twice, which is costly for element
// types such as std::string or std::vector<double>).
// `src` must not be the same vector as `*dest`.
template<typename T>
void PushVector(std::vector<T>* dest, const std::vector<T>& src) {
  dest->insert(dest->end(), src.begin(), src.end());
}

// Appends `element + offset` to `*dest` for every element of `src`,
// preserving the order of `src`.
template<typename T>
void PushOffset(std::vector<T>* dest, const std::vector<T>& src, const T& offset) {
  const size_t extra = src.size();
  dest->reserve(dest->size() + extra);
  for (size_t pos = 0; pos < extra; ++pos) {
    dest->push_back(src[pos] + offset);
  }
}

// Concatenates two optional per-feature arrays, where an empty vector stands
// for "all elements equal `deflt`".
//   both non-empty      : `src` is appended to `*dest`
//   only `src` empty    : `src_len` copies of `deflt` are appended
//   only `*dest` empty  : `*dest` becomes `dest_len` copies of `deflt`
//                         followed by `src`
//   both empty          : `*dest` stays empty
template<typename T>
void PushClearIfEmpty(std::vector<T>* dest, const size_t dest_len, const std::vector<T>& src, const size_t src_len, const T& deflt) {
  const bool dest_has_values = !dest->empty();
  const bool src_has_values = !src.empty();
  if (!dest_has_values && !src_has_values) {
    return;
  }
  if (!src_has_values) {
    // materialize the implicit defaults of the appended part
    dest->insert(dest->end(), src_len, deflt);
    return;
  }
  if (!dest_has_values) {
    // materialize the implicit defaults of the existing part
    dest->assign(dest_len, deflt);
  }
  PushVector(dest, src);
}

1097
1098
// Appends all features of `other` to this dataset (column-wise merge).
//
// Both datasets must have the same number of rows, otherwise
// std::runtime_error is thrown. Feature groups of `other` are deep-copied;
// all index maps copied from `other` are shifted by this dataset's current
// feature / total-feature / group counts so they stay valid after the merge.
void Dataset::addFeaturesFrom(Dataset* other) {
  if (other->num_data_ != num_data_) {
    throw std::runtime_error("Cannot add features from other Dataset with a different number of rows");
  }
  PushVector(&feature_names_, other->feature_names_);
  PushVector(&feature2subfeature_, other->feature2subfeature_);
  PushVector(&group_feature_cnt_, other->group_feature_cnt_);
  PushVector(&forced_bin_bounds_, other->forced_bin_bounds_);

  // reserve room for the combined groups so the appends below never reallocate
  feature_groups_.reserve(feature_groups_.size() + other->feature_groups_.size());
  for (const auto& fg : other->feature_groups_) {
    feature_groups_.emplace_back(new FeatureGroup(*fg));
  }

  used_feature_map_.reserve(used_feature_map_.size() + other->used_feature_map_.size());
  for (auto feature_idx : other->used_feature_map_) {
    if (feature_idx >= 0) {
      used_feature_map_.push_back(feature_idx + num_features_);
    } else {
      used_feature_map_.push_back(-1);  // Unused feature.
    }
  }
  PushOffset(&real_feature_idx_, other->real_feature_idx_, num_total_features_);
  PushOffset(&feature2group_, other->feature2group_, num_groups_);

  auto bin_offset = group_bin_boundaries_.back();
  // Skip the leading 0 when copying group_bin_boundaries.
  for (auto i = other->group_bin_boundaries_.begin()+1; i < other->group_bin_boundaries_.end(); ++i) {
    group_bin_boundaries_.push_back(*i + bin_offset);
  }
  PushOffset(&group_feature_start_, other->group_feature_start_, num_features_);

  // monotone_types_ and feature_penalty_ hold one entry per used (inner)
  // feature - SaveBinaryFile reads and writes num_features_ entries of each -
  // so missing parts must be padded with num_features_ defaults.
  // max_bin_by_feature_ holds one entry per raw feature and is padded with
  // num_total_features_ defaults.
  PushClearIfEmpty(&monotone_types_, num_features_, other->monotone_types_, other->num_features_, static_cast<int8_t>(0));
  PushClearIfEmpty(&feature_penalty_, num_features_, other->feature_penalty_, other->num_features_, 1.0);
  PushClearIfEmpty(&max_bin_by_feature_, num_total_features_, other->max_bin_by_feature_, other->num_total_features_, -1);

  // counts are updated last: everything above needs the pre-merge values
  num_features_ += other->num_features_;
  num_total_features_ += other->num_total_features_;
  num_groups_ += other->num_groups_;
}

Guolin Ke's avatar
Guolin Ke committed
1134
}  // namespace LightGBM