dataset_loader.cpp 53.2 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
Guolin Ke's avatar
Guolin Ke committed
5
6
#include <LightGBM/dataset_loader.h>

7
#include <LightGBM/json11.hpp>
Guolin Ke's avatar
Guolin Ke committed
8
#include <LightGBM/network.h>
9
10
11
#include <LightGBM/utils/array_args.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/utils/openmp_wrapper.h>
Guolin Ke's avatar
Guolin Ke committed
12

13
14
15
16
#include <fstream>

using namespace json11;

Guolin Ke's avatar
Guolin Ke committed
17
18
namespace LightGBM {

Guolin Ke's avatar
Guolin Ke committed
19
20
/*!
 * \brief Create a loader bound to a configuration and a prediction callback.
 * \param io_config Configuration stored in config_ and consulted by the loading routines
 * \param predict_fun Prediction function stored in predict_fun_
 * \param num_class Number of classes, stored in num_class_
 * \param filename Data file handed to SetHeader to resolve column roles; may be nullptr
 */
DatasetLoader::DatasetLoader(const Config& io_config,
                             const PredictFunction& predict_fun,
                             int num_class,
                             const char* filename)
  : config_(io_config),
    random_(config_.data_random_seed),
    predict_fun_(predict_fun),
    num_class_(num_class) {
  // Defaults used until SetHeader overrides them from the configuration:
  // the label is column 0 and there is no weight or group column.
  group_idx_ = NO_SPECIFIC;
  weight_idx_ = NO_SPECIFIC;
  label_idx_ = 0;
  SetHeader(filename);
}

/*! \brief Destructor; performs no work beyond destroying the members. */
DatasetLoader::~DatasetLoader() = default;

Guolin Ke's avatar
Guolin Ke committed
30
void DatasetLoader::SetHeader(const char* filename) {
Guolin Ke's avatar
Guolin Ke committed
31
  std::unordered_map<std::string, int> name2idx;
Guolin Ke's avatar
Guolin Ke committed
32
33
  std::string name_prefix("name:");
  if (filename != nullptr) {
Guolin Ke's avatar
Guolin Ke committed
34
    TextReader<data_size_t> text_reader(filename, config_.header);
Guolin Ke's avatar
Guolin Ke committed
35

Guolin Ke's avatar
Guolin Ke committed
36
    // get column names
Guolin Ke's avatar
Guolin Ke committed
37
    if (config_.header) {
Guolin Ke's avatar
Guolin Ke committed
38
      std::string first_line = text_reader.first_line();
39
      feature_names_ = Common::Split(first_line.c_str(), "\t,");
Guolin Ke's avatar
Guolin Ke committed
40
41
    }

Guolin Ke's avatar
Guolin Ke committed
42
    // load label idx first
Guolin Ke's avatar
Guolin Ke committed
43
44
45
    if (config_.label_column.size() > 0) {
      if (Common::StartsWith(config_.label_column, name_prefix)) {
        std::string name = config_.label_column.substr(name_prefix.size());
Guolin Ke's avatar
Guolin Ke committed
46
47
48
49
50
51
52
53
54
55
        label_idx_ = -1;
        for (int i = 0; i < static_cast<int>(feature_names_.size()); ++i) {
          if (name == feature_names_[i]) {
            label_idx_ = i;
            break;
          }
        }
        if (label_idx_ >= 0) {
          Log::Info("Using column %s as label", name.c_str());
        } else {
56
57
          Log::Fatal("Could not find label column %s in data file \n"
                     "or data file doesn't contain header", name.c_str());
Guolin Ke's avatar
Guolin Ke committed
58
        }
Guolin Ke's avatar
Guolin Ke committed
59
      } else {
Guolin Ke's avatar
Guolin Ke committed
60
        if (!Common::AtoiAndCheck(config_.label_column.c_str(), &label_idx_)) {
61
62
63
          Log::Fatal("label_column is not a number,\n"
                     "if you want to use a column name,\n"
                     "please add the prefix \"name:\" to the column name");
Guolin Ke's avatar
Guolin Ke committed
64
65
        }
        Log::Info("Using column number %d as label", label_idx_);
Guolin Ke's avatar
Guolin Ke committed
66
67
      }
    }
Guolin Ke's avatar
Guolin Ke committed
68

Guolin Ke's avatar
Guolin Ke committed
69
    if (!feature_names_.empty()) {
Guolin Ke's avatar
Guolin Ke committed
70
71
72
73
      // erase label column name
      feature_names_.erase(feature_names_.begin() + label_idx_);
      for (size_t i = 0; i < feature_names_.size(); ++i) {
        name2idx[feature_names_[i]] = static_cast<int>(i);
Guolin Ke's avatar
Guolin Ke committed
74
      }
Guolin Ke's avatar
Guolin Ke committed
75
76
77
    }

    // load ignore columns
Guolin Ke's avatar
Guolin Ke committed
78
79
80
    if (config_.ignore_column.size() > 0) {
      if (Common::StartsWith(config_.ignore_column, name_prefix)) {
        std::string names = config_.ignore_column.substr(name_prefix.size());
Guolin Ke's avatar
Guolin Ke committed
81
82
83
84
85
86
87
88
89
        for (auto name : Common::Split(names.c_str(), ',')) {
          if (name2idx.count(name) > 0) {
            int tmp = name2idx[name];
            ignore_features_.emplace(tmp);
          } else {
            Log::Fatal("Could not find ignore column %s in data file", name.c_str());
          }
        }
      } else {
Guolin Ke's avatar
Guolin Ke committed
90
        for (auto token : Common::Split(config_.ignore_column.c_str(), ',')) {
Guolin Ke's avatar
Guolin Ke committed
91
92
          int tmp = 0;
          if (!Common::AtoiAndCheck(token.c_str(), &tmp)) {
93
94
95
            Log::Fatal("ignore_column is not a number,\n"
                       "if you want to use a column name,\n"
                       "please add the prefix \"name:\" to the column name");
Guolin Ke's avatar
Guolin Ke committed
96
97
          }
          ignore_features_.emplace(tmp);
Guolin Ke's avatar
Guolin Ke committed
98
99
100
        }
      }
    }
Guolin Ke's avatar
Guolin Ke committed
101
    // load weight idx
Guolin Ke's avatar
Guolin Ke committed
102
103
104
    if (config_.weight_column.size() > 0) {
      if (Common::StartsWith(config_.weight_column, name_prefix)) {
        std::string name = config_.weight_column.substr(name_prefix.size());
Guolin Ke's avatar
Guolin Ke committed
105
106
107
108
109
110
        if (name2idx.count(name) > 0) {
          weight_idx_ = name2idx[name];
          Log::Info("Using column %s as weight", name.c_str());
        } else {
          Log::Fatal("Could not find weight column %s in data file", name.c_str());
        }
Guolin Ke's avatar
Guolin Ke committed
111
      } else {
Guolin Ke's avatar
Guolin Ke committed
112
        if (!Common::AtoiAndCheck(config_.weight_column.c_str(), &weight_idx_)) {
113
114
115
          Log::Fatal("weight_column is not a number,\n"
                     "if you want to use a column name,\n"
                     "please add the prefix \"name:\" to the column name");
Guolin Ke's avatar
Guolin Ke committed
116
117
        }
        Log::Info("Using column number %d as weight", weight_idx_);
Guolin Ke's avatar
Guolin Ke committed
118
      }
Guolin Ke's avatar
Guolin Ke committed
119
      ignore_features_.emplace(weight_idx_);
Guolin Ke's avatar
Guolin Ke committed
120
    }
Guolin Ke's avatar
Guolin Ke committed
121
    // load group idx
Guolin Ke's avatar
Guolin Ke committed
122
123
124
    if (config_.group_column.size() > 0) {
      if (Common::StartsWith(config_.group_column, name_prefix)) {
        std::string name = config_.group_column.substr(name_prefix.size());
Guolin Ke's avatar
Guolin Ke committed
125
126
127
128
129
130
131
        if (name2idx.count(name) > 0) {
          group_idx_ = name2idx[name];
          Log::Info("Using column %s as group/query id", name.c_str());
        } else {
          Log::Fatal("Could not find group/query column %s in data file", name.c_str());
        }
      } else {
Guolin Ke's avatar
Guolin Ke committed
132
        if (!Common::AtoiAndCheck(config_.group_column.c_str(), &group_idx_)) {
133
134
135
          Log::Fatal("group_column is not a number,\n"
                     "if you want to use a column name,\n"
                     "please add the prefix \"name:\" to the column name");
Guolin Ke's avatar
Guolin Ke committed
136
137
138
139
        }
        Log::Info("Using column number %d as group/query id", group_idx_);
      }
      ignore_features_.emplace(group_idx_);
Guolin Ke's avatar
Guolin Ke committed
140
141
    }
  }
Guolin Ke's avatar
Guolin Ke committed
142
143
144
  if (config_.categorical_feature.size() > 0) {
    if (Common::StartsWith(config_.categorical_feature, name_prefix)) {
      std::string names = config_.categorical_feature.substr(name_prefix.size());
145
146
147
148
149
      for (auto name : Common::Split(names.c_str(), ',')) {
        if (name2idx.count(name) > 0) {
          int tmp = name2idx[name];
          categorical_features_.emplace(tmp);
        } else {
Guolin Ke's avatar
Guolin Ke committed
150
          Log::Fatal("Could not find categorical_feature %s in data file", name.c_str());
151
152
153
        }
      }
    } else {
Guolin Ke's avatar
Guolin Ke committed
154
      for (auto token : Common::Split(config_.categorical_feature.c_str(), ',')) {
155
156
        int tmp = 0;
        if (!Common::AtoiAndCheck(token.c_str(), &tmp)) {
Guolin Ke's avatar
Guolin Ke committed
157
          Log::Fatal("categorical_feature is not a number,\n"
158
159
                     "if you want to use a column name,\n"
                     "please add the prefix \"name:\" to the column name");
160
161
162
163
164
        }
        categorical_features_.emplace(tmp);
      }
    }
  }
Guolin Ke's avatar
Guolin Ke committed
165
166
}

167
/*!
 * \brief Load a training dataset from a text data file, or from a binary file
 *        when CheckCanLoadFromBin returns one for it.
 * \param filename Path of the data file
 * \param initscore_file Initial score file, forwarded to the metadata initialisation
 * \param rank Rank of this machine, forwarded to the loading/sampling helpers
 * \param num_machines Number of machines taking part in training
 * \return Newly allocated Dataset; ownership is released to the caller
 */
Dataset* DatasetLoader::LoadFromFile(const char* filename, const char* initscore_file, int rank, int num_machines) {
  // don't support query id in data file when training in parallel
  if (num_machines > 1 && !config_.pre_partition) {
    // group_idx_ keeps the NO_SPECIFIC value assigned in the constructor unless a
    // group column was configured. Comparing against that sentinel (instead of
    // "> 0") also rejects a group column located at index 0.
    if (group_idx_ != NO_SPECIFIC) {
      Log::Fatal("Using a query id without pre-partitioning the data file is not supported for parallel training.\n"
                 "Please use an additional query file or pre-partition the data");
    }
  }
  auto dataset = std::unique_ptr<Dataset>(new Dataset());
  data_size_t num_global_data = 0;
  std::vector<data_size_t> used_data_indices;
  auto bin_filename = CheckCanLoadFromBin(filename);
  if (bin_filename.size() == 0) {
    auto parser = std::unique_ptr<Parser>(Parser::CreateParser(filename, config_.header, 0, label_idx_));
    if (parser == nullptr) {
      Log::Fatal("Could not recognize data format of %s", filename);
    }
    dataset->data_filename_ = filename;
    dataset->label_idx_ = label_idx_;
    dataset->metadata_.Init(filename, initscore_file);
    if (!config_.two_round) {
      // read data to memory
      auto text_data = LoadTextDataToMemory(filename, dataset->metadata_, rank, num_machines, &num_global_data, &used_data_indices);
      dataset->num_data_ = static_cast<data_size_t>(text_data.size());
      // sample data
      auto sample_data = SampleTextDataFromMemory(text_data);
      // construct feature bin mappers
      ConstructBinMappersFromTextData(rank, num_machines, sample_data, parser.get(), dataset.get());
      // initialize label
      dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_);
      // extract features
      ExtractFeaturesFromMemory(&text_data, parser.get(), dataset.get());
      text_data.clear();
    } else {
      // sample data from file
      auto sample_data = SampleTextDataFromFile(filename, dataset->metadata_, rank, num_machines, &num_global_data, &used_data_indices);
      // a non-empty index list means only a subset of the file is used locally
      if (used_data_indices.size() > 0) {
        dataset->num_data_ = static_cast<data_size_t>(used_data_indices.size());
      } else {
        dataset->num_data_ = num_global_data;
      }
      // construct feature bin mappers
      ConstructBinMappersFromTextData(rank, num_machines, sample_data, parser.get(), dataset.get());
      // initialize label
      dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_);

      // extract features
      ExtractFeaturesFromFile(filename, parser.get(), used_data_indices, dataset.get());
    }
  } else {
    // load data from binary file
    dataset.reset(LoadFromBinFile(filename, bin_filename.c_str(), rank, num_machines, &num_global_data, &used_data_indices));
  }
  // check meta data
  dataset->metadata_.CheckOrPartition(num_global_data, used_data_indices);
  // need to check training data
  CheckDataset(dataset.get());
  return dataset.release();
}



229
/*!
 * \brief Load a dataset that is aligned with an existing training dataset.
 *
 * Text input is always read as a single, unpartitioned machine (rank 0 of 1) and
 * its layout is taken from train_data through Dataset::CreateValid. Unlike
 * LoadFromFile, no CheckDataset call is made on the result.
 *
 * \param filename Path of the data file
 * \param initscore_file Initial score file, forwarded to the metadata initialisation
 * \param train_data Dataset passed to CreateValid
 * \return Newly allocated Dataset; ownership is released to the caller
 */
Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename, const char* initscore_file, const Dataset* train_data) {
  data_size_t total_rows = 0;
  std::vector<data_size_t> row_subset;
  std::unique_ptr<Dataset> valid_data(new Dataset());
  const auto bin_path = CheckCanLoadFromBin(filename);
  if (!bin_path.empty()) {
    // CheckCanLoadFromBin returned a binary file name: read that file instead
    valid_data.reset(LoadFromBinFile(filename, bin_path.c_str(), 0, 1, &total_rows, &row_subset));
  } else {
    std::unique_ptr<Parser> parser(Parser::CreateParser(filename, config_.header, 0, label_idx_));
    if (!parser) {
      Log::Fatal("Could not recognize data format of %s", filename);
    }
    valid_data->data_filename_ = filename;
    valid_data->label_idx_ = label_idx_;
    valid_data->metadata_.Init(filename, initscore_file);
    if (config_.two_round) {
      // two-round mode: count the rows first, then stream features from the file
      TextReader<data_size_t> text_reader(filename, config_.header);
      valid_data->num_data_ = static_cast<data_size_t>(text_reader.CountLine());
      total_rows = valid_data->num_data_;
      // set up label storage before the features are filled in
      valid_data->metadata_.Init(valid_data->num_data_, weight_idx_, group_idx_);
      valid_data->CreateValid(train_data);
      ExtractFeaturesFromFile(filename, parser.get(), row_subset, valid_data.get());
    } else {
      // one-round mode: hold every text line in memory
      auto lines = LoadTextDataToMemory(filename, valid_data->metadata_, 0, 1, &total_rows, &row_subset);
      valid_data->num_data_ = static_cast<data_size_t>(lines.size());
      // set up label storage before the features are filled in
      valid_data->metadata_.Init(valid_data->num_data_, weight_idx_, group_idx_);
      valid_data->CreateValid(train_data);
      ExtractFeaturesFromMemory(&lines, parser.get(), valid_data.get());
      lines.clear();
    }
  }
  // only the meta data is checked here
  valid_data->metadata_.CheckOrPartition(total_rows, row_subset);
  return valid_data.release();
}

273
Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices) {
Guolin Ke's avatar
Guolin Ke committed
274
  auto dataset = std::unique_ptr<Dataset>(new Dataset());
275
  auto reader = VirtualFileReader::Make(bin_filename);
Guolin Ke's avatar
Guolin Ke committed
276
  dataset->data_filename_ = data_filename;
277
  if (!reader->Init()) {
Guolin Ke's avatar
Guolin Ke committed
278
279
280
281
282
    Log::Fatal("Could not read binary data from %s", bin_filename);
  }

  // buffer to read binary file
  size_t buffer_size = 16 * 1024 * 1024;
Guolin Ke's avatar
Guolin Ke committed
283
  auto buffer = std::vector<char>(buffer_size);
284

285
286
  // check token
  size_t size_of_token = std::strlen(Dataset::binary_file_token);
287
288
  size_t read_cnt = reader->Read(buffer.data(), sizeof(char) * size_of_token);
  if (read_cnt != sizeof(char) * size_of_token) {
289
290
291
    Log::Fatal("Binary file error: token has the wrong size");
  }
  if (std::string(buffer.data()) != std::string(Dataset::binary_file_token)) {
292
    Log::Fatal("Input file is not LightGBM binary file");
293
  }
Guolin Ke's avatar
Guolin Ke committed
294
295

  // read size of header
296
  read_cnt = reader->Read(buffer.data(), sizeof(size_t));
Guolin Ke's avatar
Guolin Ke committed
297

298
  if (read_cnt != sizeof(size_t)) {
Guolin Ke's avatar
Guolin Ke committed
299
300
301
    Log::Fatal("Binary file error: header has the wrong size");
  }

Guolin Ke's avatar
Guolin Ke committed
302
  size_t size_of_head = *(reinterpret_cast<size_t*>(buffer.data()));
Guolin Ke's avatar
Guolin Ke committed
303
304
305
306

  // re-allocmate space if not enough
  if (size_of_head > buffer_size) {
    buffer_size = size_of_head;
Guolin Ke's avatar
Guolin Ke committed
307
    buffer.resize(buffer_size);
Guolin Ke's avatar
Guolin Ke committed
308
309
  }
  // read header
310
  read_cnt = reader->Read(buffer.data(), size_of_head);
Guolin Ke's avatar
Guolin Ke committed
311
312
313
314
315

  if (read_cnt != size_of_head) {
    Log::Fatal("Binary file error: header is incorrect");
  }
  // get header
Guolin Ke's avatar
Guolin Ke committed
316
  const char* mem_ptr = buffer.data();
Guolin Ke's avatar
Guolin Ke committed
317
318
319
320
321
322
  dataset->num_data_ = *(reinterpret_cast<const data_size_t*>(mem_ptr));
  mem_ptr += sizeof(dataset->num_data_);
  dataset->num_features_ = *(reinterpret_cast<const int*>(mem_ptr));
  mem_ptr += sizeof(dataset->num_features_);
  dataset->num_total_features_ = *(reinterpret_cast<const int*>(mem_ptr));
  mem_ptr += sizeof(dataset->num_total_features_);
Guolin Ke's avatar
Guolin Ke committed
323
324
  dataset->label_idx_ = *(reinterpret_cast<const int*>(mem_ptr));
  mem_ptr += sizeof(dataset->label_idx_);
325
326
327
328
329
330
331
332
333
334
  dataset->max_bin_ = *(reinterpret_cast<const int*>(mem_ptr));
  mem_ptr += sizeof(dataset->max_bin_);
  dataset->bin_construct_sample_cnt_ = *(reinterpret_cast<const int*>(mem_ptr));
  mem_ptr += sizeof(dataset->bin_construct_sample_cnt_);
  dataset->min_data_in_bin_ = *(reinterpret_cast<const int*>(mem_ptr));
  mem_ptr += sizeof(dataset->min_data_in_bin_);
  dataset->use_missing_ = *(reinterpret_cast<const bool*>(mem_ptr));
  mem_ptr += sizeof(dataset->use_missing_);
  dataset->zero_as_missing_ = *(reinterpret_cast<const bool*>(mem_ptr));
  mem_ptr += sizeof(dataset->zero_as_missing_);
Guolin Ke's avatar
Guolin Ke committed
335
336
  dataset->sparse_threshold_ = *(reinterpret_cast<const double*>(mem_ptr));
  mem_ptr += sizeof(dataset->sparse_threshold_);
Guolin Ke's avatar
Guolin Ke committed
337
338
  const int* tmp_feature_map = reinterpret_cast<const int*>(mem_ptr);
  dataset->used_feature_map_.clear();
Guolin Ke's avatar
Guolin Ke committed
339
  for (int i = 0; i < dataset->num_total_features_; ++i) {
Guolin Ke's avatar
Guolin Ke committed
340
341
    dataset->used_feature_map_.push_back(tmp_feature_map[i]);
  }
Guolin Ke's avatar
Guolin Ke committed
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
  mem_ptr += sizeof(int) * dataset->num_total_features_;
  // num_groups
  dataset->num_groups_ = *(reinterpret_cast<const int*>(mem_ptr));
  mem_ptr += sizeof(dataset->num_groups_);
  // real_feature_idx_
  const int* tmp_ptr_real_feature_idx_ = reinterpret_cast<const int*>(mem_ptr);
  dataset->real_feature_idx_.clear();
  for (int i = 0; i < dataset->num_features_; ++i) {
    dataset->real_feature_idx_.push_back(tmp_ptr_real_feature_idx_[i]);
  }
  mem_ptr += sizeof(int) * dataset->num_features_;
  // feature2group
  const int* tmp_ptr_feature2group = reinterpret_cast<const int*>(mem_ptr);
  dataset->feature2group_.clear();
  for (int i = 0; i < dataset->num_features_; ++i) {
    dataset->feature2group_.push_back(tmp_ptr_feature2group[i]);
  }
  mem_ptr += sizeof(int) * dataset->num_features_;
  // feature2subfeature
  const int* tmp_ptr_feature2subfeature = reinterpret_cast<const int*>(mem_ptr);
  dataset->feature2subfeature_.clear();
  for (int i = 0; i < dataset->num_features_; ++i) {
    dataset->feature2subfeature_.push_back(tmp_ptr_feature2subfeature[i]);
  }
  mem_ptr += sizeof(int) * dataset->num_features_;
  // group_bin_boundaries
  const uint64_t* tmp_ptr_group_bin_boundaries = reinterpret_cast<const uint64_t*>(mem_ptr);
  dataset->group_bin_boundaries_.clear();
  for (int i = 0; i < dataset->num_groups_ + 1; ++i) {
    dataset->group_bin_boundaries_.push_back(tmp_ptr_group_bin_boundaries[i]);
  }
  mem_ptr += sizeof(uint64_t) * (dataset->num_groups_ + 1);

  // group_feature_start_
  const int* tmp_ptr_group_feature_start = reinterpret_cast<const int*>(mem_ptr);
  dataset->group_feature_start_.clear();
378
  for (int i = 0; i < dataset->num_groups_; ++i) {
Guolin Ke's avatar
Guolin Ke committed
379
380
381
382
383
384
385
386
387
388
389
390
    dataset->group_feature_start_.push_back(tmp_ptr_group_feature_start[i]);
  }
  mem_ptr += sizeof(int) * (dataset->num_groups_);

  // group_feature_cnt_
  const int* tmp_ptr_group_feature_cnt = reinterpret_cast<const int*>(mem_ptr);
  dataset->group_feature_cnt_.clear();
  for (int i = 0; i < dataset->num_groups_; ++i) {
    dataset->group_feature_cnt_.push_back(tmp_ptr_group_feature_cnt[i]);
  }
  mem_ptr += sizeof(int) * (dataset->num_groups_);

391
  if (!config_.monotone_constraints.empty()) {
392
    CHECK(static_cast<size_t>(dataset->num_total_features_) == config_.monotone_constraints.size());
393
    dataset->monotone_types_.resize(dataset->num_features_);
394
    for (int i = 0; i < dataset->num_total_features_; ++i) {
395
      int inner_fidx = dataset->InnerFeatureIndex(i);
396
      if (inner_fidx >= 0) {
397
398
399
        dataset->monotone_types_[inner_fidx] = config_.monotone_constraints[i];
      }
    }
400
  } else {
401
402
403
404
405
    const int8_t* tmp_ptr_monotone_type = reinterpret_cast<const int8_t*>(mem_ptr);
    dataset->monotone_types_.clear();
    for (int i = 0; i < dataset->num_features_; ++i) {
      dataset->monotone_types_.push_back(tmp_ptr_monotone_type[i]);
    }
Guolin Ke's avatar
Guolin Ke committed
406
407
408
409
410
411
412
  }
  mem_ptr += sizeof(int8_t) * (dataset->num_features_);

  if (ArrayArgs<int8_t>::CheckAllZero(dataset->monotone_types_)) {
    dataset->monotone_types_.clear();
  }

413
  if (!config_.feature_contri.empty()) {
414
    CHECK(static_cast<size_t>(dataset->num_total_features_) == config_.feature_contri.size());
415
    dataset->feature_penalty_.resize(dataset->num_features_);
416
    for (int i = 0; i < dataset->num_total_features_; ++i) {
417
      int inner_fidx = dataset->InnerFeatureIndex(i);
418
      if (inner_fidx >= 0) {
419
420
421
        dataset->feature_penalty_[inner_fidx] = config_.feature_contri[i];
      }
    }
422
  } else {
423
424
425
426
427
    const double* tmp_ptr_feature_penalty = reinterpret_cast<const double*>(mem_ptr);
    dataset->feature_penalty_.clear();
    for (int i = 0; i < dataset->num_features_; ++i) {
      dataset->feature_penalty_.push_back(tmp_ptr_feature_penalty[i]);
    }
Guolin Ke's avatar
Guolin Ke committed
428
429
430
431
432
433
434
  }
  mem_ptr += sizeof(double) * (dataset->num_features_);

  if (ArrayArgs<double>::CheckAll(dataset->feature_penalty_, 1)) {
    dataset->feature_penalty_.clear();
  }

Belinda Trotta's avatar
Belinda Trotta committed
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
  if (!config_.max_bin_by_feature.empty()) {
    CHECK(static_cast<size_t>(dataset->num_total_features_) == config_.max_bin_by_feature.size());
    CHECK(*(std::min_element(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end())) > 1);
    dataset->max_bin_by_feature_.resize(dataset->num_total_features_);
    dataset->max_bin_by_feature_.assign(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end());
  } else {
    const int32_t* tmp_ptr_max_bin_by_feature = reinterpret_cast<const int32_t*>(mem_ptr);
    dataset->max_bin_by_feature_.clear();
    for (int i = 0; i < dataset->num_total_features_; ++i) {
      dataset->max_bin_by_feature_.push_back(tmp_ptr_max_bin_by_feature[i]);
    }
  }
  mem_ptr += sizeof(int32_t) * (dataset->num_total_features_);
  if (ArrayArgs<int32_t>::CheckAll(dataset->max_bin_by_feature_, -1)) {
    dataset->max_bin_by_feature_.clear();
  }

Guolin Ke's avatar
Guolin Ke committed
452
  // get feature names
Guolin Ke's avatar
Guolin Ke committed
453
  dataset->feature_names_.clear();
Guolin Ke's avatar
Guolin Ke committed
454
455
456
457
458
459
460
461
462
463
  // write feature names
  for (int i = 0; i < dataset->num_total_features_; ++i) {
    int str_len = *(reinterpret_cast<const int*>(mem_ptr));
    mem_ptr += sizeof(int);
    std::stringstream str_buf;
    for (int j = 0; j < str_len; ++j) {
      char tmp_char = *(reinterpret_cast<const char*>(mem_ptr));
      mem_ptr += sizeof(char);
      str_buf << tmp_char;
    }
Guolin Ke's avatar
Guolin Ke committed
464
    dataset->feature_names_.emplace_back(str_buf.str());
Guolin Ke's avatar
Guolin Ke committed
465
  }
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
  // get forced_bin_bounds_
  dataset->forced_bin_bounds_ = std::vector<std::vector<double>>(dataset->num_total_features_, std::vector<double>());
  for (int i = 0; i < dataset->num_total_features_; ++i) {
    int num_bounds = *(reinterpret_cast<const int*>(mem_ptr));
    mem_ptr += sizeof(int);
    dataset->forced_bin_bounds_[i] = std::vector<double>();
    const double* tmp_ptr_forced_bounds = reinterpret_cast<const double*>(mem_ptr);
    
    for (int j = 0; j < num_bounds; ++j) {
      double bound = tmp_ptr_forced_bounds[j];
      dataset->forced_bin_bounds_[i].push_back(bound);
    }
    mem_ptr += num_bounds * sizeof(double);
   
  }
Guolin Ke's avatar
Guolin Ke committed
481
482

  // read size of meta data
483
  read_cnt = reader->Read(buffer.data(), sizeof(size_t));
Guolin Ke's avatar
Guolin Ke committed
484

485
  if (read_cnt != sizeof(size_t)) {
Guolin Ke's avatar
Guolin Ke committed
486
487
488
    Log::Fatal("Binary file error: meta data has the wrong size");
  }

Guolin Ke's avatar
Guolin Ke committed
489
  size_t size_of_metadata = *(reinterpret_cast<size_t*>(buffer.data()));
Guolin Ke's avatar
Guolin Ke committed
490
491
492
493

  // re-allocate space if not enough
  if (size_of_metadata > buffer_size) {
    buffer_size = size_of_metadata;
Guolin Ke's avatar
Guolin Ke committed
494
    buffer.resize(buffer_size);
Guolin Ke's avatar
Guolin Ke committed
495
496
  }
  //  read meta data
497
  read_cnt = reader->Read(buffer.data(), size_of_metadata);
Guolin Ke's avatar
Guolin Ke committed
498
499
500
501
502

  if (read_cnt != size_of_metadata) {
    Log::Fatal("Binary file error: meta data is incorrect");
  }
  // load meta data
Guolin Ke's avatar
Guolin Ke committed
503
  dataset->metadata_.LoadFromMemory(buffer.data());
Guolin Ke's avatar
Guolin Ke committed
504

505
506
  *num_global_data = dataset->num_data_;
  used_data_indices->clear();
Guolin Ke's avatar
Guolin Ke committed
507
  // sample local used data if need to partition
Guolin Ke's avatar
Guolin Ke committed
508
  if (num_machines > 1 && !config_.pre_partition) {
Guolin Ke's avatar
Guolin Ke committed
509
510
511
512
    const data_size_t* query_boundaries = dataset->metadata_.query_boundaries();
    if (query_boundaries == nullptr) {
      // if not contain query file, minimal sample unit is one record
      for (data_size_t i = 0; i < dataset->num_data_; ++i) {
Guolin Ke's avatar
Guolin Ke committed
513
        if (random_.NextShort(0, num_machines) == rank) {
514
          used_data_indices->push_back(i);
Guolin Ke's avatar
Guolin Ke committed
515
516
517
518
519
520
521
522
523
        }
      }
    } else {
      // if contain query file, minimal sample unit is one query
      data_size_t num_queries = dataset->metadata_.num_queries();
      data_size_t qid = -1;
      bool is_query_used = false;
      for (data_size_t i = 0; i < dataset->num_data_; ++i) {
        if (qid >= num_queries) {
524
525
          Log::Fatal("Current query exceeds the range of the query file,\n"
                     "please ensure the query file is correct");
Guolin Ke's avatar
Guolin Ke committed
526
527
528
529
        }
        if (i >= query_boundaries[qid + 1]) {
          // if is new query
          is_query_used = false;
Guolin Ke's avatar
Guolin Ke committed
530
          if (random_.NextShort(0, num_machines) == rank) {
Guolin Ke's avatar
Guolin Ke committed
531
532
533
534
535
            is_query_used = true;
          }
          ++qid;
        }
        if (is_query_used) {
536
          used_data_indices->push_back(i);
Guolin Ke's avatar
Guolin Ke committed
537
538
539
        }
      }
    }
540
    dataset->num_data_ = static_cast<data_size_t>((*used_data_indices).size());
Guolin Ke's avatar
Guolin Ke committed
541
  }
542
  dataset->metadata_.PartitionLabel(*used_data_indices);
Guolin Ke's avatar
Guolin Ke committed
543
  // read feature data
Guolin Ke's avatar
Guolin Ke committed
544
  for (int i = 0; i < dataset->num_groups_; ++i) {
Guolin Ke's avatar
Guolin Ke committed
545
    // read feature size
546
547
    read_cnt = reader->Read(buffer.data(), sizeof(size_t));
    if (read_cnt != sizeof(size_t)) {
Guolin Ke's avatar
Guolin Ke committed
548
549
      Log::Fatal("Binary file error: feature %d has the wrong size", i);
    }
Guolin Ke's avatar
Guolin Ke committed
550
    size_t size_of_feature = *(reinterpret_cast<size_t*>(buffer.data()));
Guolin Ke's avatar
Guolin Ke committed
551
552
553
    // re-allocate space if not enough
    if (size_of_feature > buffer_size) {
      buffer_size = size_of_feature;
Guolin Ke's avatar
Guolin Ke committed
554
      buffer.resize(buffer_size);
Guolin Ke's avatar
Guolin Ke committed
555
556
    }

557
    read_cnt = reader->Read(buffer.data(), size_of_feature);
Guolin Ke's avatar
Guolin Ke committed
558
559
560
561

    if (read_cnt != size_of_feature) {
      Log::Fatal("Binary file error: feature %d is incorrect, read count: %d", i, read_cnt);
    }
Guolin Ke's avatar
Guolin Ke committed
562
    dataset->feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>(
563
564
      new FeatureGroup(buffer.data(),
                       *num_global_data,
565
                       *used_data_indices)));
Guolin Ke's avatar
Guolin Ke committed
566
  }
Guolin Ke's avatar
Guolin Ke committed
567
  dataset->feature_groups_.shrink_to_fit();
Guolin Ke's avatar
Guolin Ke committed
568
  dataset->is_finish_load_ = true;
Guolin Ke's avatar
Guolin Ke committed
569
  return dataset.release();
Guolin Ke's avatar
Guolin Ke committed
570
571
}

572

573
574
575
576
/*!
 * \brief Build a Dataset from column-wise sampled values.
 *
 * Finds one BinMapper per column (locally when there is a single machine,
 * otherwise split across machines and exchanged with Network::Allgather),
 * then constructs the Dataset from those mappers.
 *
 * \param sample_values sample_values[j] holds the sampled values of column j
 * \param sample_indices per-column sample indices, forwarded to Dataset::Construct
 * \param num_col number of columns
 * \param num_per_col num_per_col[j] is the number of entries in sample_values[j]
 * \param total_sample_size total number of sampled rows
 * \param num_data number of rows of the dataset being built
 * \return newly allocated Dataset; ownership is released to the caller
 */
Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
                                               int** sample_indices, int num_col, const int* num_per_col,
                                               size_t total_sample_size, data_size_t num_data) {
  std::vector<std::unique_ptr<BinMapper>> bin_mappers(num_col);
  // no names known yet: synthesize "Column_<i>" for every column
  if (feature_names_.empty()) {
    for (int i = 0; i < num_col; ++i) {
      std::stringstream str_buf;
      str_buf << "Column_" << i;
      feature_names_.push_back(str_buf.str());
    }
  }
  // per-feature max_bin, when given, must cover every column and be > 1
  if (!config_.max_bin_by_feature.empty()) {
    CHECK(static_cast<size_t>(num_col) == config_.max_bin_by_feature.size());
    CHECK(*(std::min_element(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end())) > 1);
  }

  // get forced split; the result is indexed by global column index below
  std::string forced_bins_path = config_.forcedbins_filename;
  std::vector<std::vector<double>> forced_bin_bounds = DatasetLoader::GetForcedBins(forced_bins_path, num_col, categorical_features_);

  // min_data_in_leaf rescaled from the full data size to the sample size
  const data_size_t filter_cnt = static_cast<data_size_t>(
    static_cast<double>(config_.min_data_in_leaf * total_sample_size) / num_data);
  const bool use_global_max_bin = config_.max_bin_by_feature.empty();

  if (Network::num_machines() == 1) {
    // if only one machine, find bin locally
    OMP_INIT_EX();
    #pragma omp parallel for schedule(guided)
    for (int i = 0; i < num_col; ++i) {
      OMP_LOOP_EX_BEGIN();
      if (ignore_features_.count(i) > 0) {
        bin_mappers[i] = nullptr;
        continue;
      }
      BinType bin_type = BinType::NumericalBin;
      if (categorical_features_.count(i)) {
        bin_type = BinType::CategoricalBin;
        bool feat_is_unconstrained = ((config_.monotone_constraints.size() == 0) || (config_.monotone_constraints[i] == 0));
        if (!feat_is_unconstrained) {
            Log::Fatal("The output cannot be monotone with respect to categorical features");
        }
      }
      const int feature_max_bin = use_global_max_bin ? config_.max_bin : config_.max_bin_by_feature[i];
      bin_mappers[i].reset(new BinMapper());
      bin_mappers[i]->FindBin(sample_values[i], num_per_col[i], total_sample_size,
                              feature_max_bin, config_.min_data_in_bin, filter_cnt,
                              bin_type, config_.use_missing, config_.zero_as_missing,
                              forced_bin_bounds[i]);
      OMP_LOOP_EX_END();
    }
    OMP_THROW_EX();
  } else {
    // if have multi-machines, need to find bin distributed
    // different machines will find bin for different features
    int num_machines = Network::num_machines();
    int rank = Network::rank();
    int total_num_feature = num_col;
    total_num_feature = Network::GlobalSyncUpByMin(total_num_feature);
    // start and len will store the process feature indices for different machines
    // machine i will find bins for features in [ start[i], start[i] + len[i] )
    std::vector<int> start(num_machines);
    std::vector<int> len(num_machines);
    int step = (total_num_feature + num_machines - 1) / num_machines;
    if (step < 1) { step = 1; }

    start[0] = 0;
    for (int i = 0; i < num_machines - 1; ++i) {
      len[i] = std::min(step, total_num_feature - start[i]);
      start[i + 1] = start[i] + len[i];
    }
    len[num_machines - 1] = total_num_feature - start[num_machines - 1];
    OMP_INIT_EX();
    #pragma omp parallel for schedule(guided)
    for (int i = 0; i < len[rank]; ++i) {
      OMP_LOOP_EX_BEGIN();
      // i is the slot inside this machine's slice; feature_idx is the global column
      const int feature_idx = start[rank] + i;
      if (ignore_features_.count(feature_idx) > 0) {
        continue;
      }
      BinType bin_type = BinType::NumericalBin;
      if (categorical_features_.count(feature_idx)) {
        // NOTE(review): unlike the single-machine path, no monotone-constraint
        // check is done for categorical features here - confirm whether intended.
        bin_type = BinType::CategoricalBin;
      }
      const int feature_max_bin = use_global_max_bin ? config_.max_bin : config_.max_bin_by_feature[feature_idx];
      bin_mappers[i].reset(new BinMapper());
      // every per-feature input, including the forced bounds, must be looked up
      // with the global column index, not the local slot
      bin_mappers[i]->FindBin(sample_values[feature_idx], num_per_col[feature_idx],
                              total_sample_size, feature_max_bin, config_.min_data_in_bin,
                              filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing,
                              forced_bin_bounds[feature_idx]);
      OMP_LOOP_EX_END();
    }
    OMP_THROW_EX();
    int max_bin = 0;
    for (int i = 0; i < len[rank]; ++i) {
      if (bin_mappers[i] != nullptr) {
        max_bin = std::max(max_bin, bin_mappers[i]->num_bin());
      }
    }
    max_bin = Network::GlobalSyncUpByMax(max_bin);
    // get size of bin mapper with max_bin size
    int type_size = BinMapper::SizeForSpecificBin(max_bin);
    // since sizes of different feature may not be same, we expand all bin mapper to type_size
    comm_size_t buffer_size = type_size * total_num_feature;
    CHECK(buffer_size >= 0);
    auto input_buffer = std::vector<char>(buffer_size);
    auto output_buffer = std::vector<char>(buffer_size);

    // find local feature bins and copy to buffer
    #pragma omp parallel for schedule(guided)
    for (int i = 0; i < len[rank]; ++i) {
      OMP_LOOP_EX_BEGIN();
      if (ignore_features_.count(start[rank] + i) > 0) {
        continue;
      }
      bin_mappers[i]->CopyTo(input_buffer.data() + i * type_size);
      // free
      bin_mappers[i].reset(nullptr);
      OMP_LOOP_EX_END();
    }
    OMP_THROW_EX();
    std::vector<comm_size_t> size_start(num_machines);
    std::vector<comm_size_t> size_len(num_machines);
    // convert to binary size
    for (int i = 0; i < num_machines; ++i) {
      size_start[i] = start[i] * static_cast<comm_size_t>(type_size);
      size_len[i] = len[i] * static_cast<comm_size_t>(type_size);
    }
    // gather global feature bin mappers
    Network::Allgather(input_buffer.data(), size_start.data(), size_len.data(), output_buffer.data(), buffer_size);
    // restore features bins from buffer; from here on bin_mappers is indexed globally
    for (int i = 0; i < total_num_feature; ++i) {
      if (ignore_features_.count(i) > 0) {
        bin_mappers[i] = nullptr;
        continue;
      }
      bin_mappers[i].reset(new BinMapper());
      bin_mappers[i]->CopyFrom(output_buffer.data() + i * type_size);
    }
  }
  auto dataset = std::unique_ptr<Dataset>(new Dataset(num_data));
  dataset->Construct(&bin_mappers, forced_bin_bounds, sample_indices, num_per_col, total_sample_size, config_);
  dataset->set_feature_names(feature_names_);
  return dataset.release();
}
Guolin Ke's avatar
Guolin Ke committed
727
728
729
730
731
732


// ---- private functions ----

void DatasetLoader::CheckDataset(const Dataset* dataset) {
  if (dataset->num_data_ <= 0) {
Guolin Ke's avatar
Guolin Ke committed
733
    Log::Fatal("Data file %s is empty", dataset->data_filename_.c_str());
Guolin Ke's avatar
Guolin Ke committed
734
  }
735
736
  if (dataset->feature_names_.size() != static_cast<size_t>(dataset->num_total_features_)) {
    Log::Fatal("Size of feature name error, should be %d, got %d", dataset->num_total_features_,
737
               static_cast<int>(dataset->feature_names_.size()));
738
  }
Guolin Ke's avatar
Guolin Ke committed
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
  bool is_feature_order_by_group = true;
  int last_group = -1;
  int last_sub_feature = -1;
  // if features are ordered, not need to use hist_buf
  for (int i = 0; i < dataset->num_features_; ++i) {
    int group = dataset->feature2group_[i];
    int sub_feature = dataset->feature2subfeature_[i];
    if (group < last_group) {
      is_feature_order_by_group = false;
    } else if (group == last_group) {
      if (sub_feature <= last_sub_feature) {
        is_feature_order_by_group = false;
        break;
      }
    }
    last_group = group;
    last_sub_feature = sub_feature;
  }
  if (!is_feature_order_by_group) {
758
    Log::Fatal("Features in dataset should be ordered by group");
Guolin Ke's avatar
Guolin Ke committed
759
  }
Guolin Ke's avatar
Guolin Ke committed
760
761
762
}

std::vector<std::string> DatasetLoader::LoadTextDataToMemory(const char* filename, const Metadata& metadata,
763
764
                                                             int rank, int num_machines, int* num_global_data,
                                                             std::vector<data_size_t>* used_data_indices) {
Guolin Ke's avatar
Guolin Ke committed
765
  TextReader<data_size_t> text_reader(filename, config_.header);
Guolin Ke's avatar
Guolin Ke committed
766
  used_data_indices->clear();
Guolin Ke's avatar
Guolin Ke committed
767
  if (num_machines == 1 || config_.pre_partition) {
Guolin Ke's avatar
Guolin Ke committed
768
769
770
771
772
773
774
775
776
    // read all lines
    *num_global_data = text_reader.ReadAllLines();
  } else {  // need partition data
            // get query data
    const data_size_t* query_boundaries = metadata.query_boundaries();

    if (query_boundaries == nullptr) {
      // if not contain query data, minimal sample unit is one record
      *num_global_data = text_reader.ReadAndFilterLines([this, rank, num_machines](data_size_t) {
Guolin Ke's avatar
Guolin Ke committed
777
        if (random_.NextShort(0, num_machines) == rank) {
Guolin Ke's avatar
Guolin Ke committed
778
779
780
781
782
783
784
785
786
787
788
789
790
791
          return true;
        } else {
          return false;
        }
      }, used_data_indices);
    } else {
      // if contain query data, minimal sample unit is one query
      data_size_t num_queries = metadata.num_queries();
      data_size_t qid = -1;
      bool is_query_used = false;
      *num_global_data = text_reader.ReadAndFilterLines(
        [this, rank, num_machines, &qid, &query_boundaries, &is_query_used, num_queries]
      (data_size_t line_idx) {
        if (qid >= num_queries) {
792
793
          Log::Fatal("Current query exceeds the range of the query file,\n"
                     "please ensure the query file is correct");
Guolin Ke's avatar
Guolin Ke committed
794
795
796
797
        }
        if (line_idx >= query_boundaries[qid + 1]) {
          // if is new query
          is_query_used = false;
Guolin Ke's avatar
Guolin Ke committed
798
          if (random_.NextShort(0, num_machines) == rank) {
Guolin Ke's avatar
Guolin Ke committed
799
800
801
802
803
804
805
806
807
808
809
810
            is_query_used = true;
          }
          ++qid;
        }
        return is_query_used;
      }, used_data_indices);
    }
  }
  return std::move(text_reader.Lines());
}

std::vector<std::string> DatasetLoader::SampleTextDataFromMemory(const std::vector<std::string>& data) {
Guolin Ke's avatar
Guolin Ke committed
811
  int sample_cnt = config_.bin_construct_sample_cnt;
812
813
  if (static_cast<size_t>(sample_cnt) > data.size()) {
    sample_cnt = static_cast<int>(data.size());
814
  }
815
  auto sample_indices = random_.Sample(static_cast<int>(data.size()), sample_cnt);
Guolin Ke's avatar
Guolin Ke committed
816
  std::vector<std::string> out(sample_indices.size());
Guolin Ke's avatar
Guolin Ke committed
817
818
  for (size_t i = 0; i < sample_indices.size(); ++i) {
    const size_t idx = sample_indices[i];
Guolin Ke's avatar
Guolin Ke committed
819
    out[i] = data[idx];
Guolin Ke's avatar
Guolin Ke committed
820
821
822
823
824
  }
  return out;
}

std::vector<std::string> DatasetLoader::SampleTextDataFromFile(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices) {
Guolin Ke's avatar
Guolin Ke committed
825
826
  const data_size_t sample_cnt = static_cast<data_size_t>(config_.bin_construct_sample_cnt);
  TextReader<data_size_t> text_reader(filename, config_.header);
Guolin Ke's avatar
Guolin Ke committed
827
  std::vector<std::string> out_data;
Guolin Ke's avatar
Guolin Ke committed
828
  if (num_machines == 1 || config_.pre_partition) {
Guolin Ke's avatar
Guolin Ke committed
829
    *num_global_data = static_cast<data_size_t>(text_reader.SampleFromFile(&random_, sample_cnt, &out_data));
Guolin Ke's avatar
Guolin Ke committed
830
831
832
833
834
835
836
  } else {  // need partition data
            // get query data
    const data_size_t* query_boundaries = metadata.query_boundaries();
    if (query_boundaries == nullptr) {
      // if not contain query file, minimal sample unit is one record
      *num_global_data = text_reader.SampleAndFilterFromFile([this, rank, num_machines]
      (data_size_t) {
Guolin Ke's avatar
Guolin Ke committed
837
        if (random_.NextShort(0, num_machines) == rank) {
Guolin Ke's avatar
Guolin Ke committed
838
839
840
841
          return true;
        } else {
          return false;
        }
Guolin Ke's avatar
Guolin Ke committed
842
      }, used_data_indices, &random_, sample_cnt, &out_data);
Guolin Ke's avatar
Guolin Ke committed
843
844
845
846
847
848
849
850
851
    } else {
      // if contain query file, minimal sample unit is one query
      data_size_t num_queries = metadata.num_queries();
      data_size_t qid = -1;
      bool is_query_used = false;
      *num_global_data = text_reader.SampleAndFilterFromFile(
        [this, rank, num_machines, &qid, &query_boundaries, &is_query_used, num_queries]
      (data_size_t line_idx) {
        if (qid >= num_queries) {
852
853
          Log::Fatal("Query id exceeds the range of the query file, "
                     "please ensure the query file is correct");
Guolin Ke's avatar
Guolin Ke committed
854
855
856
857
        }
        if (line_idx >= query_boundaries[qid + 1]) {
          // if is new query
          is_query_used = false;
Guolin Ke's avatar
Guolin Ke committed
858
          if (random_.NextShort(0, num_machines) == rank) {
Guolin Ke's avatar
Guolin Ke committed
859
860
861
862
863
            is_query_used = true;
          }
          ++qid;
        }
        return is_query_used;
Guolin Ke's avatar
Guolin Ke committed
864
      }, used_data_indices, &random_, sample_cnt, &out_data);
Guolin Ke's avatar
Guolin Ke committed
865
866
867
868
869
870
871
    }
  }
  return out_data;
}

void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, const std::vector<std::string>& sample_data, const Parser* parser, Dataset* dataset) {
  std::vector<std::vector<double>> sample_values;
Guolin Ke's avatar
Guolin Ke committed
872
  std::vector<std::vector<int>> sample_indices;
Guolin Ke's avatar
Guolin Ke committed
873
874
  std::vector<std::pair<int, double>> oneline_features;
  double label;
Guolin Ke's avatar
Guolin Ke committed
875
  for (int i = 0; i < static_cast<int>(sample_data.size()); ++i) {
Guolin Ke's avatar
Guolin Ke committed
876
877
878
879
    oneline_features.clear();
    // parse features
    parser->ParseOneLine(sample_data[i].c_str(), &oneline_features, &label);
    for (std::pair<int, double>& inner_data : oneline_features) {
880
      if (static_cast<size_t>(inner_data.first) >= sample_values.size()) {
Guolin Ke's avatar
Guolin Ke committed
881
882
        sample_values.resize(inner_data.first + 1);
        sample_indices.resize(inner_data.first + 1);
883
      }
Guolin Ke's avatar
Guolin Ke committed
884
      if (std::fabs(inner_data.second) > kZeroThreshold || std::isnan(inner_data.second)) {
Guolin Ke's avatar
Guolin Ke committed
885
886
        sample_values[inner_data.first].emplace_back(inner_data.second);
        sample_indices[inner_data.first].emplace_back(i);
Guolin Ke's avatar
Guolin Ke committed
887
888
889
890
      }
    }
  }

Guolin Ke's avatar
Guolin Ke committed
891
  dataset->feature_groups_.clear();
Guolin Ke's avatar
Guolin Ke committed
892

893
894
  if (feature_names_.empty()) {
    // -1 means doesn't use this feature
Guolin Ke's avatar
Guolin Ke committed
895
896
    dataset->num_total_features_ = std::max(static_cast<int>(sample_values.size()), parser->TotalColumns() - 1);
    dataset->used_feature_map_ = std::vector<int>(dataset->num_total_features_, -1);
897
898
899
900
  } else {
    dataset->used_feature_map_ = std::vector<int>(feature_names_.size(), -1);
    dataset->num_total_features_ = static_cast<int>(feature_names_.size());
  }
Guolin Ke's avatar
Guolin Ke committed
901

Belinda Trotta's avatar
Belinda Trotta committed
902
903
904
905
906
  if (!config_.max_bin_by_feature.empty()) {
    CHECK(static_cast<size_t>(dataset->num_total_features_) == config_.max_bin_by_feature.size());
    CHECK(*(std::min_element(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end())) > 1);
  }

907
908
909
910
911
  // get forced split
  std::string forced_bins_path = config_.forcedbins_filename;
  std::vector<std::vector<double>> forced_bin_bounds = DatasetLoader::GetForcedBins(forced_bins_path, dataset->num_total_features_, 
                                                                                    categorical_features_);

Guolin Ke's avatar
Guolin Ke committed
912
913
914
915
916
917
  // check the range of label_idx, weight_idx and group_idx
  CHECK(label_idx_ >= 0 && label_idx_ <= dataset->num_total_features_);
  CHECK(weight_idx_ < 0 || weight_idx_ < dataset->num_total_features_);
  CHECK(group_idx_ < 0 || group_idx_ < dataset->num_total_features_);

  // fill feature_names_ if not header
Guolin Ke's avatar
Guolin Ke committed
918
  if (feature_names_.empty()) {
Guolin Ke's avatar
Guolin Ke committed
919
920
921
922
923
924
    for (int i = 0; i < dataset->num_total_features_; ++i) {
      std::stringstream str_buf;
      str_buf << "Column_" << i;
      feature_names_.push_back(str_buf.str());
    }
  }
925
  dataset->set_feature_names(feature_names_);
Guolin Ke's avatar
Guolin Ke committed
926
  std::vector<std::unique_ptr<BinMapper>> bin_mappers(dataset->num_total_features_);
Guolin Ke's avatar
Guolin Ke committed
927
  const data_size_t filter_cnt = static_cast<data_size_t>(
Guolin Ke's avatar
Guolin Ke committed
928
    static_cast<double>(config_.min_data_in_leaf* sample_data.size()) / dataset->num_data_);
Guolin Ke's avatar
Guolin Ke committed
929

Guolin Ke's avatar
Guolin Ke committed
930
931
932
  // start find bins
  if (num_machines == 1) {
    // if only one machine, find bin locally
933
    OMP_INIT_EX();
934
    #pragma omp parallel for schedule(guided)
Guolin Ke's avatar
Guolin Ke committed
935
    for (int i = 0; i < static_cast<int>(sample_values.size()); ++i) {
936
      OMP_LOOP_EX_BEGIN();
Guolin Ke's avatar
Guolin Ke committed
937
      if (ignore_features_.count(i) > 0) {
Guolin Ke's avatar
Guolin Ke committed
938
        bin_mappers[i] = nullptr;
Guolin Ke's avatar
Guolin Ke committed
939
940
        continue;
      }
941
942
943
944
      BinType bin_type = BinType::NumericalBin;
      if (categorical_features_.count(i)) {
        bin_type = BinType::CategoricalBin;
      }
Guolin Ke's avatar
Guolin Ke committed
945
      bin_mappers[i].reset(new BinMapper());
Belinda Trotta's avatar
Belinda Trotta committed
946
947
      if (config_.max_bin_by_feature.empty()) {
        bin_mappers[i]->FindBin(sample_values[i].data(), static_cast<int>(sample_values[i].size()),
948
                                sample_data.size(), config_.max_bin, config_.min_data_in_bin,
949
950
                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing,
                                forced_bin_bounds[i]);
Belinda Trotta's avatar
Belinda Trotta committed
951
952
      } else {
        bin_mappers[i]->FindBin(sample_values[i].data(), static_cast<int>(sample_values[i].size()),
953
954
                                sample_data.size(), config_.max_bin_by_feature[i],
                                config_.min_data_in_bin, filter_cnt, bin_type, config_.use_missing,
955
                                config_.zero_as_missing, forced_bin_bounds[i]);
Belinda Trotta's avatar
Belinda Trotta committed
956
      }
957
      OMP_LOOP_EX_END();
Guolin Ke's avatar
Guolin Ke committed
958
    }
959
    OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
960
  } else {
961
    // if have multi-machines, need to find bin distributed
Guolin Ke's avatar
Guolin Ke committed
962
963
    // different machines will find bin for different features

Guolin Ke's avatar
Guolin Ke committed
964
965
966
    int num_total_features = dataset->num_total_features_;
    num_total_features = Network::GlobalSyncUpByMin(num_total_features);
    dataset->num_total_features_ = num_total_features;
Guolin Ke's avatar
Guolin Ke committed
967
    // start and len will store the process feature indices for different machines
968
    // machine i will find bins for features in [ start[i], start[i] + len[i] )
Guolin Ke's avatar
Guolin Ke committed
969
970
    std::vector<int> start(num_machines);
    std::vector<int> len(num_machines);
Guolin Ke's avatar
Guolin Ke committed
971
    int step = (num_total_features + num_machines - 1) / num_machines;
Guolin Ke's avatar
Guolin Ke committed
972
973
974
975
    if (step < 1) { step = 1; }

    start[0] = 0;
    for (int i = 0; i < num_machines - 1; ++i) {
Guolin Ke's avatar
Guolin Ke committed
976
      len[i] = std::min(step, num_total_features - start[i]);
Guolin Ke's avatar
Guolin Ke committed
977
978
      start[i + 1] = start[i] + len[i];
    }
Guolin Ke's avatar
Guolin Ke committed
979
    len[num_machines - 1] = num_total_features - start[num_machines - 1];
980
    OMP_INIT_EX();
981
    #pragma omp parallel for schedule(guided)
982
    for (int i = 0; i < len[rank]; ++i) {
983
      OMP_LOOP_EX_BEGIN();
984
985
986
987
988
989
990
991
      if (ignore_features_.count(start[rank] + i) > 0) {
        continue;
      }
      BinType bin_type = BinType::NumericalBin;
      if (categorical_features_.count(start[rank] + i)) {
        bin_type = BinType::CategoricalBin;
      }
      bin_mappers[i].reset(new BinMapper());
Belinda Trotta's avatar
Belinda Trotta committed
992
      if (config_.max_bin_by_feature.empty()) {
993
        bin_mappers[i]->FindBin(sample_values[start[rank] + i].data(),
Belinda Trotta's avatar
Belinda Trotta committed
994
                                static_cast<int>(sample_values[start[rank] + i].size()),
995
                                sample_data.size(), config_.max_bin, config_.min_data_in_bin,
996
997
                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing, 
                                forced_bin_bounds[i]);
Belinda Trotta's avatar
Belinda Trotta committed
998
      } else {
999
        bin_mappers[i]->FindBin(sample_values[start[rank] + i].data(),
Belinda Trotta's avatar
Belinda Trotta committed
1000
                                static_cast<int>(sample_values[start[rank] + i].size()),
1001
1002
                                sample_data.size(), config_.max_bin_by_feature[i],
                                config_.min_data_in_bin, filter_cnt, bin_type,
1003
                                config_.use_missing, config_.zero_as_missing, forced_bin_bounds[i]);
Belinda Trotta's avatar
Belinda Trotta committed
1004
      }
1005
      OMP_LOOP_EX_END();
1006
    }
1007
    OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
1008
1009
1010
1011
1012
1013
1014
    int max_bin = 0;
    for (int i = 0; i < len[rank]; ++i) {
      if (bin_mappers[i] != nullptr) {
        max_bin = std::max(max_bin, bin_mappers[i]->num_bin());
      }
    }
    max_bin = Network::GlobalSyncUpByMax(max_bin);
1015
    // get size of bin mapper with max_bin size
1016
    int type_size = BinMapper::SizeForSpecificBin(max_bin);
Guolin Ke's avatar
Guolin Ke committed
1017
    // since sizes of different feature may not be same, we expand all bin mapper to type_size
Guolin Ke's avatar
Guolin Ke committed
1018
    comm_size_t buffer_size = type_size * num_total_features;
Guolin Ke's avatar
Guolin Ke committed
1019
    CHECK(buffer_size >= 0);
Guolin Ke's avatar
Guolin Ke committed
1020
1021
    auto input_buffer = std::vector<char>(buffer_size);
    auto output_buffer = std::vector<char>(buffer_size);
Guolin Ke's avatar
Guolin Ke committed
1022
1023

    // find local feature bins and copy to buffer
1024
    #pragma omp parallel for schedule(guided)
Guolin Ke's avatar
Guolin Ke committed
1025
    for (int i = 0; i < len[rank]; ++i) {
1026
      OMP_LOOP_EX_BEGIN();
Guolin Ke's avatar
Guolin Ke committed
1027
1028
1029
      if (ignore_features_.count(start[rank] + i) > 0) {
        continue;
      }
1030
1031
1032
      bin_mappers[i]->CopyTo(input_buffer.data() + i * type_size);
      // free
      bin_mappers[i].reset(nullptr);
1033
      OMP_LOOP_EX_END();
Guolin Ke's avatar
Guolin Ke committed
1034
    }
1035
    OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
1036
1037
    std::vector<comm_size_t> size_start(num_machines);
    std::vector<comm_size_t> size_len(num_machines);
Guolin Ke's avatar
Guolin Ke committed
1038
1039
    // convert to binary size
    for (int i = 0; i < num_machines; ++i) {
Guolin Ke's avatar
Guolin Ke committed
1040
1041
      size_start[i] = start[i] * static_cast<comm_size_t>(type_size);
      size_len[i] = len[i] * static_cast<comm_size_t>(type_size);
Guolin Ke's avatar
Guolin Ke committed
1042
1043
    }
    // gather global feature bin mappers
Guolin Ke's avatar
Guolin Ke committed
1044
    Network::Allgather(input_buffer.data(), size_start.data(), size_len.data(), output_buffer.data(), buffer_size);
Guolin Ke's avatar
Guolin Ke committed
1045
    // restore features bins from buffer
Guolin Ke's avatar
Guolin Ke committed
1046
    for (int i = 0; i < num_total_features; ++i) {
Guolin Ke's avatar
Guolin Ke committed
1047
      if (ignore_features_.count(i) > 0) {
Guolin Ke's avatar
Guolin Ke committed
1048
        bin_mappers[i] = nullptr;
Guolin Ke's avatar
Guolin Ke committed
1049
1050
        continue;
      }
Guolin Ke's avatar
Guolin Ke committed
1051
1052
      bin_mappers[i].reset(new BinMapper());
      bin_mappers[i]->CopyFrom(output_buffer.data() + i * type_size);
Guolin Ke's avatar
Guolin Ke committed
1053
1054
    }
  }
Guolin Ke's avatar
Guolin Ke committed
1055
  sample_values.clear();
1056
  dataset->Construct(&bin_mappers, forced_bin_bounds, Common::Vector2Ptr<int>(&sample_indices).data(),
Guolin Ke's avatar
Guolin Ke committed
1057
                     Common::VectorSize<int>(sample_indices).data(), sample_data.size(), config_);
Guolin Ke's avatar
Guolin Ke committed
1058
1059
1060
}

/*! \brief Extract local features from memory */
Guolin Ke's avatar
Guolin Ke committed
1061
void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>* text_data, const Parser* parser, Dataset* dataset) {
Guolin Ke's avatar
Guolin Ke committed
1062
1063
1064
  std::vector<std::pair<int, double>> oneline_features;
  double tmp_label = 0.0f;
  if (predict_fun_ == nullptr) {
1065
    OMP_INIT_EX();
Guolin Ke's avatar
Guolin Ke committed
1066
    // if doesn't need to prediction with initial model
1067
    #pragma omp parallel for schedule(static) private(oneline_features) firstprivate(tmp_label)
Guolin Ke's avatar
Guolin Ke committed
1068
    for (data_size_t i = 0; i < dataset->num_data_; ++i) {
1069
      OMP_LOOP_EX_BEGIN();
Guolin Ke's avatar
Guolin Ke committed
1070
1071
1072
      const int tid = omp_get_thread_num();
      oneline_features.clear();
      // parser
Guolin Ke's avatar
Guolin Ke committed
1073
      parser->ParseOneLine(text_data->at(i).c_str(), &oneline_features, &tmp_label);
Guolin Ke's avatar
Guolin Ke committed
1074
      // set label
1075
      dataset->metadata_.SetLabelAt(i, static_cast<label_t>(tmp_label));
Guolin Ke's avatar
Guolin Ke committed
1076
      // free processed line:
Guolin Ke's avatar
Guolin Ke committed
1077
      text_data->at(i).clear();
Guolin Ke's avatar
Guolin Ke committed
1078
1079
1080
1081
      // shrink_to_fit will be very slow in linux, and seems not free memory, disable for now
      // text_reader_->Lines()[i].shrink_to_fit();
      // push data
      for (auto& inner_data : oneline_features) {
1082
        if (inner_data.first >= dataset->num_total_features_) { continue; }
Guolin Ke's avatar
Guolin Ke committed
1083
1084
1085
        int feature_idx = dataset->used_feature_map_[inner_data.first];
        if (feature_idx >= 0) {
          // if is used feature
Guolin Ke's avatar
Guolin Ke committed
1086
1087
1088
          int group = dataset->feature2group_[feature_idx];
          int sub_feature = dataset->feature2subfeature_[feature_idx];
          dataset->feature_groups_[group]->PushData(tid, sub_feature, i, inner_data.second);
Guolin Ke's avatar
Guolin Ke committed
1089
1090
        } else {
          if (inner_data.first == weight_idx_) {
1091
            dataset->metadata_.SetWeightAt(i, static_cast<label_t>(inner_data.second));
Guolin Ke's avatar
Guolin Ke committed
1092
1093
1094
1095
1096
          } else if (inner_data.first == group_idx_) {
            dataset->metadata_.SetQueryAt(i, static_cast<data_size_t>(inner_data.second));
          }
        }
      }
1097
      OMP_LOOP_EX_END();
Guolin Ke's avatar
Guolin Ke committed
1098
    }
1099
    OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
1100
  } else {
1101
    OMP_INIT_EX();
Guolin Ke's avatar
Guolin Ke committed
1102
    // if need to prediction with initial model
1103
    std::vector<double> init_score(dataset->num_data_ * num_class_);
1104
    #pragma omp parallel for schedule(static) private(oneline_features) firstprivate(tmp_label)
Guolin Ke's avatar
Guolin Ke committed
1105
    for (data_size_t i = 0; i < dataset->num_data_; ++i) {
1106
      OMP_LOOP_EX_BEGIN();
Guolin Ke's avatar
Guolin Ke committed
1107
1108
1109
      const int tid = omp_get_thread_num();
      oneline_features.clear();
      // parser
Guolin Ke's avatar
Guolin Ke committed
1110
      parser->ParseOneLine(text_data->at(i).c_str(), &oneline_features, &tmp_label);
Guolin Ke's avatar
Guolin Ke committed
1111
      // set initial score
Guolin Ke's avatar
Guolin Ke committed
1112
1113
      std::vector<double> oneline_init_score(num_class_);
      predict_fun_(oneline_features, oneline_init_score.data());
1114
      for (int k = 0; k < num_class_; ++k) {
1115
        init_score[k * dataset->num_data_ + i] = static_cast<double>(oneline_init_score[k]);
Guolin Ke's avatar
Guolin Ke committed
1116
1117
      }
      // set label
1118
      dataset->metadata_.SetLabelAt(i, static_cast<label_t>(tmp_label));
Guolin Ke's avatar
Guolin Ke committed
1119
1120
1121
1122
1123
1124
      // free processed line:
      text_data[i].clear();
      // shrink_to_fit will be very slow in linux, and seems not free memory, disable for now
      // text_reader_->Lines()[i].shrink_to_fit();
      // push data
      for (auto& inner_data : oneline_features) {
1125
        if (inner_data.first >= dataset->num_total_features_) { continue; }
Guolin Ke's avatar
Guolin Ke committed
1126
1127
1128
        int feature_idx = dataset->used_feature_map_[inner_data.first];
        if (feature_idx >= 0) {
          // if is used feature
Guolin Ke's avatar
Guolin Ke committed
1129
1130
          int group = dataset->feature2group_[feature_idx];
          int sub_feature = dataset->feature2subfeature_[feature_idx];
1131
          dataset->feature_groups_[group]->PushData(tid, sub_feature, i, inner_data.second);
Guolin Ke's avatar
Guolin Ke committed
1132
1133
        } else {
          if (inner_data.first == weight_idx_) {
1134
            dataset->metadata_.SetWeightAt(i, static_cast<label_t>(inner_data.second));
Guolin Ke's avatar
Guolin Ke committed
1135
1136
1137
1138
1139
          } else if (inner_data.first == group_idx_) {
            dataset->metadata_.SetQueryAt(i, static_cast<data_size_t>(inner_data.second));
          }
        }
      }
1140
      OMP_LOOP_EX_END();
Guolin Ke's avatar
Guolin Ke committed
1141
    }
1142
    OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
1143
    // metadata_ will manage space of init_score
1144
    dataset->metadata_.SetInitScore(init_score.data(), dataset->num_data_ * num_class_);
Guolin Ke's avatar
Guolin Ke committed
1145
  }
Guolin Ke's avatar
Guolin Ke committed
1146
  dataset->FinishLoad();
Guolin Ke's avatar
Guolin Ke committed
1147
  // text data can be free after loaded feature values
Guolin Ke's avatar
Guolin Ke committed
1148
  text_data->clear();
Guolin Ke's avatar
Guolin Ke committed
1149
1150
1151
1152
}

/*! \brief Extract local features from file */
void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser* parser, const std::vector<data_size_t>& used_data_indices, Dataset* dataset) {
1153
  std::vector<double> init_score;
Guolin Ke's avatar
Guolin Ke committed
1154
  if (predict_fun_ != nullptr) {
1155
    init_score = std::vector<double>(dataset->num_data_ * num_class_);
Guolin Ke's avatar
Guolin Ke committed
1156
1157
1158
1159
1160
1161
  }
  std::function<void(data_size_t, const std::vector<std::string>&)> process_fun =
    [this, &init_score, &parser, &dataset]
  (data_size_t start_idx, const std::vector<std::string>& lines) {
    std::vector<std::pair<int, double>> oneline_features;
    double tmp_label = 0.0f;
1162
    OMP_INIT_EX();
1163
    #pragma omp parallel for schedule(static) private(oneline_features) firstprivate(tmp_label)
Guolin Ke's avatar
Guolin Ke committed
1164
    for (data_size_t i = 0; i < static_cast<data_size_t>(lines.size()); ++i) {
1165
      OMP_LOOP_EX_BEGIN();
Guolin Ke's avatar
Guolin Ke committed
1166
1167
1168
1169
1170
      const int tid = omp_get_thread_num();
      oneline_features.clear();
      // parser
      parser->ParseOneLine(lines[i].c_str(), &oneline_features, &tmp_label);
      // set initial score
Guolin Ke's avatar
Guolin Ke committed
1171
      if (!init_score.empty()) {
Guolin Ke's avatar
Guolin Ke committed
1172
1173
        std::vector<double> oneline_init_score(num_class_);
        predict_fun_(oneline_features, oneline_init_score.data());
1174
        for (int k = 0; k < num_class_; ++k) {
1175
          init_score[k * dataset->num_data_ + start_idx + i] = static_cast<double>(oneline_init_score[k]);
Guolin Ke's avatar
Guolin Ke committed
1176
1177
1178
        }
      }
      // set label
1179
      dataset->metadata_.SetLabelAt(start_idx + i, static_cast<label_t>(tmp_label));
Guolin Ke's avatar
Guolin Ke committed
1180
1181
      // push data
      for (auto& inner_data : oneline_features) {
1182
        if (inner_data.first >= dataset->num_total_features_) { continue; }
Guolin Ke's avatar
Guolin Ke committed
1183
1184
1185
        int feature_idx = dataset->used_feature_map_[inner_data.first];
        if (feature_idx >= 0) {
          // if is used feature
Guolin Ke's avatar
Guolin Ke committed
1186
1187
1188
          int group = dataset->feature2group_[feature_idx];
          int sub_feature = dataset->feature2subfeature_[feature_idx];
          dataset->feature_groups_[group]->PushData(tid, sub_feature, start_idx + i, inner_data.second);
Guolin Ke's avatar
Guolin Ke committed
1189
1190
        } else {
          if (inner_data.first == weight_idx_) {
1191
            dataset->metadata_.SetWeightAt(start_idx + i, static_cast<label_t>(inner_data.second));
Guolin Ke's avatar
Guolin Ke committed
1192
1193
1194
1195
1196
          } else if (inner_data.first == group_idx_) {
            dataset->metadata_.SetQueryAt(start_idx + i, static_cast<data_size_t>(inner_data.second));
          }
        }
      }
1197
      OMP_LOOP_EX_END();
Guolin Ke's avatar
Guolin Ke committed
1198
    }
1199
    OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
1200
  };
Guolin Ke's avatar
Guolin Ke committed
1201
  TextReader<data_size_t> text_reader(filename, config_.header);
Guolin Ke's avatar
Guolin Ke committed
1202
  if (!used_data_indices.empty()) {
Guolin Ke's avatar
Guolin Ke committed
1203
1204
1205
1206
1207
1208
1209
1210
    // only need part of data
    text_reader.ReadPartAndProcessParallel(used_data_indices, process_fun);
  } else {
    // need full data
    text_reader.ReadAllAndProcessParallel(process_fun);
  }

  // metadata_ will manage space of init_score
Guolin Ke's avatar
Guolin Ke committed
1211
  if (!init_score.empty()) {
1212
    dataset->metadata_.SetInitScore(init_score.data(), dataset->num_data_ * num_class_);
Guolin Ke's avatar
Guolin Ke committed
1213
  }
Guolin Ke's avatar
Guolin Ke committed
1214
  dataset->FinishLoad();
Guolin Ke's avatar
Guolin Ke committed
1215
1216
1217
}

/*! \brief Check can load from binary file */
1218
std::string DatasetLoader::CheckCanLoadFromBin(const char* filename) {
Guolin Ke's avatar
Guolin Ke committed
1219
1220
1221
  std::string bin_filename(filename);
  bin_filename.append(".bin");

1222
  auto reader = VirtualFileReader::Make(bin_filename.c_str());
Guolin Ke's avatar
Guolin Ke committed
1223

1224
  if (!reader->Init()) {
1225
    bin_filename = std::string(filename);
1226
1227
    reader = VirtualFileReader::Make(bin_filename.c_str());
    if (!reader->Init()) {
1228
      Log::Fatal("Cannot open data file %s", bin_filename.c_str());
1229
    }
1230
  }
1231
1232
1233
1234
1235

  size_t buffer_size = 256;
  auto buffer = std::vector<char>(buffer_size);
  // read size of token
  size_t size_of_token = std::strlen(Dataset::binary_file_token);
1236
  size_t read_cnt = reader->Read(buffer.data(), size_of_token);
1237
1238
  if (read_cnt == size_of_token
      && std::string(buffer.data()) == std::string(Dataset::binary_file_token)) {
1239
    return bin_filename;
Guolin Ke's avatar
Guolin Ke committed
1240
  } else {
1241
    return std::string();
Guolin Ke's avatar
Guolin Ke committed
1242
1243
1244
  }
}

1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282


/*!
 * \brief Load user-specified bin upper bounds from a JSON file
 *
 * The file is expected to hold a JSON array whose items carry a "feature"
 * index and a "bin_upper_bound" array of numbers. Entries that refer to a
 * categorical feature are skipped with a warning.
 *
 * \param forced_bins_path Path of the JSON file; an empty string means no forced bins
 * \param num_total_features Number of features; the result has one entry per feature
 * \param categorical_features Indices of categorical features, for which forced bins are ignored
 * \return One vector of upper bounds per feature (empty if nothing was specified for it)
 */
std::vector<std::vector<double>> DatasetLoader::GetForcedBins(std::string forced_bins_path, int num_total_features,
                                                              const std::unordered_set<int>& categorical_features) {
  std::vector<std::vector<double>> forced_bins(num_total_features, std::vector<double>());
  if (forced_bins_path != "") {
    std::ifstream forced_bins_stream(forced_bins_path.c_str());
    if (forced_bins_stream.fail()) {
      Log::Warning("Could not open %s. Will ignore.", forced_bins_path.c_str());
    } else {
      std::stringstream buffer;
      buffer << forced_bins_stream.rdbuf();
      std::string err;
      Json forced_bins_json = Json::parse(buffer.str(), err);
      CHECK(forced_bins_json.is_array());
      std::vector<Json> forced_bins_arr = forced_bins_json.array_items();
      for (size_t i = 0; i < forced_bins_arr.size(); ++i) {
        int feature_num = forced_bins_arr[i]["feature"].int_value();
        // the index comes from a user file and is used to subscript forced_bins,
        // so reject both negative and too large values
        CHECK(feature_num >= 0);
        CHECK(feature_num < num_total_features);
        if (categorical_features.count(feature_num)) {
          Log::Warning("Feature %d is categorical. Will ignore forced bins for this  feature.", feature_num);
        } else {
          std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
          for (size_t j = 0; j < bounds_arr.size(); ++j) {
            forced_bins[feature_num].push_back(bounds_arr[j].number_value());
          }
        }
      }
      // remove duplicates
      // std::unique only collapses adjacent equal values
      // NOTE(review): this presumably relies on the bounds being listed in sorted order - confirm
      for (int i = 0; i < num_total_features; ++i) {
        auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end());
        forced_bins[i].erase(new_end, forced_bins[i].end());
      }
    }
  }
  return forced_bins;
}

1283
}  // namespace LightGBM