dataset.cpp 7.39 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
#include <LightGBM/dataset.h>

#include <LightGBM/feature.h>

#include <LightGBM/utils/openmp_wrapper.h>

#include <cstdio>
#include <cstring>
#include <limits>
#include <memory>
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
Guolin Ke's avatar
Guolin Ke committed
14
15
16

namespace LightGBM {

17
// Marker string that SaveBinaryFile writes at the very beginning of a binary dataset file.
const char* Dataset::binary_file_token = "______LightGBM_Binary_File_Token______\n";
Guolin Ke's avatar
Guolin Ke committed
18

Guolin Ke's avatar
Guolin Ke committed
19
// Creates a dataset with zero rows and the placeholder file name "noname".
Dataset::Dataset() {
  num_data_ = 0;
  data_filename_ = "noname";
}

24
// Creates a dataset sized for num_data rows, using the placeholder file name "noname".
// The metadata is initialized for the same number of rows; the two -1 arguments are
// forwarded unchanged to Metadata::Init (their meaning is defined there).
Dataset::Dataset(data_size_t num_data) {
  num_data_ = num_data;
  data_filename_ = "noname";
  metadata_.Init(num_data_, -1, -1);
}

Guolin Ke's avatar
Guolin Ke committed
30
// The destructor body is intentionally empty: nothing is released explicitly here.
Dataset::~Dataset() {}

Guolin Ke's avatar
Guolin Ke committed
34
35
36
37
38
39
// Calls FinishLoad() on every feature; the per-feature calls are distributed
// across OpenMP threads.
void Dataset::FinishLoad() {
#pragma omp parallel for schedule(guided)
  for (int fidx = 0; fidx < num_features_; ++fidx) {
    features_[fidx]->FinishLoad();
  }
}
Guolin Ke's avatar
Guolin Ke committed
40

Guolin Ke's avatar
Guolin Ke committed
41
42
void Dataset::CopyFeatureMapperFrom(const Dataset* dataset, bool is_enable_sparse) {
  features_.clear();
Guolin Ke's avatar
Guolin Ke committed
43
  num_features_ = dataset->num_features_;
Guolin Ke's avatar
Guolin Ke committed
44
  // copy feature bin mapper data
Guolin Ke's avatar
Guolin Ke committed
45
46
47
48
49
  for(int i = 0;i < num_features_;++i){
    features_.emplace_back(new Feature(dataset->features_[i]->feature_index(),
      new BinMapper(*(dataset->features_[i]->bin_mapper())),
      num_data_,
      is_enable_sparse));
Guolin Ke's avatar
Guolin Ke committed
50
  }
Guolin Ke's avatar
Guolin Ke committed
51
  features_.shrink_to_fit();
Guolin Ke's avatar
Guolin Ke committed
52
53
54
  used_feature_map_ = dataset->used_feature_map_;
  num_total_features_ = dataset->num_total_features_;
  feature_names_ = dataset->feature_names_;
Guolin Ke's avatar
Guolin Ke committed
55
  label_idx_ = dataset->label_idx_;
Guolin Ke's avatar
Guolin Ke committed
56
57
}

Guolin Ke's avatar
Guolin Ke committed
58
59
60
61
62
63
64
65
66
67
68
69
// Changes the number of rows to num_data and resizes every feature to match.
// Does nothing when the size is already num_data.
void Dataset::ReSize(data_size_t num_data) {
  if (num_data_ == num_data) {
    return;
  }
  num_data_ = num_data;
#pragma omp parallel for schedule(guided)
  for (int i = 0; i < num_features_; ++i) {
    features_[i]->ReSize(num_data_);
  }
}

void Dataset::CopySubset(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data) {
  CHECK(num_used_indices == num_data_);
Guolin Ke's avatar
Guolin Ke committed
70
71
#pragma omp parallel for schedule(guided)
  for (int fidx = 0; fidx < num_features_; ++fidx) {
Guolin Ke's avatar
Guolin Ke committed
72
    auto iterator = fullset->features_[fidx]->bin_data()->GetIterator(used_indices[0]);
Guolin Ke's avatar
Guolin Ke committed
73
    for (data_size_t i = 0; i < num_used_indices; ++i) {
Guolin Ke's avatar
Guolin Ke committed
74
      features_[fidx]->PushBin(0, i, iterator->Get(used_indices[i]));
Guolin Ke's avatar
Guolin Ke committed
75
76
    }
  }
Guolin Ke's avatar
Guolin Ke committed
77
  if (need_meta_data) {
Guolin Ke's avatar
Guolin Ke committed
78
    metadata_.Init(metadata_, used_indices, num_used_indices);
Guolin Ke's avatar
Guolin Ke committed
79
  }
Guolin Ke's avatar
Guolin Ke committed
80
  FinishLoad();
Guolin Ke's avatar
Guolin Ke committed
81
82
}

83
bool Dataset::SetFloatField(const char* field_name, const float* field_data, data_size_t num_element) {
Guolin Ke's avatar
Guolin Ke committed
84
85
86
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("label") || name == std::string("target")) {
87
    metadata_.SetLabel(field_data, num_element);
Guolin Ke's avatar
Guolin Ke committed
88
  } else if (name == std::string("weight") || name == std::string("weights")) {
89
    metadata_.SetWeights(field_data, num_element);
Guolin Ke's avatar
Guolin Ke committed
90
91
92
93
94
95
96
97
98
99
  } else {
    return false;
  }
  return true;
}

bool Dataset::SetDoubleField(const char* field_name, const double* field_data, data_size_t num_element) {
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("init_score")) {
100
    metadata_.SetInitScore(field_data, num_element);
Guolin Ke's avatar
Guolin Ke committed
101
  } else {
102
    return false;
Guolin Ke's avatar
Guolin Ke committed
103
  }
104
  return true;
Guolin Ke's avatar
Guolin Ke committed
105
106
}

107
108
109
110
// Stores an int array in the metadata field selected by field_name.
// The name is trimmed first; "query"/"group" are passed to Metadata::SetQuery and
// "query_id"/"group_id" to Metadata::SetQueryId.
// Returns true when the name was recognized, false otherwise.
bool Dataset::SetIntField(const char* field_name, const int* field_data, data_size_t num_element) {
  std::string key(field_name);
  key = Common::Trim(key);
  if (key == "query" || key == "group") {
    metadata_.SetQuery(field_data, num_element);
    return true;
  }
  if (key == "query_id" || key == "group_id") {
    metadata_.SetQueryId(field_data, num_element);
    return true;
  }
  return false;
}

Guolin Ke's avatar
Guolin Ke committed
120
// Looks up a float metadata field by name (trimmed before comparison).
// "label"/"target" yield the label pointer, "weight"/"weights" the weight pointer;
// in both cases *out_len is set to num_data_.
// Returns false, leaving the outputs untouched, when the name is not recognized.
bool Dataset::GetFloatField(const char* field_name, data_size_t* out_len, const float** out_ptr) {
  std::string key(field_name);
  key = Common::Trim(key);
  if (key == "label" || key == "target") {
    *out_ptr = metadata_.label();
    *out_len = num_data_;
    return true;
  }
  if (key == "weight" || key == "weights") {
    *out_ptr = metadata_.weights();
    *out_len = num_data_;
    return true;
  }
  return false;
}

bool Dataset::GetDoubleField(const char* field_name, data_size_t* out_len, const double** out_ptr) {
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("init_score")) {
139
    *out_ptr = metadata_.init_score();
Guolin Ke's avatar
Guolin Ke committed
140
    *out_len = static_cast<data_size_t>(metadata_.num_init_score());
141
142
143
  } else {
    return false;
  }
144
  return true;
145
146
}

Guolin Ke's avatar
Guolin Ke committed
147
// Looks up an int metadata field by name (trimmed before comparison).
// Only "query"/"group" are recognized: *out_ptr receives the query boundaries and
// *out_len is num_queries() + 1.
// Returns false, leaving the outputs untouched, when the name is not recognized.
bool Dataset::GetIntField(const char* field_name, data_size_t* out_len, const int** out_ptr) {
  std::string key(field_name);
  key = Common::Trim(key);
  if (key != "query" && key != "group") {
    return false;
  }
  *out_ptr = metadata_.query_boundaries();
  *out_len = metadata_.num_queries() + 1;
  return true;
}

Guolin Ke's avatar
Guolin Ke committed
159
void Dataset::SaveBinaryFile(const char* bin_filename) {
Guolin Ke's avatar
Guolin Ke committed
160
161
162
163
164
  if (bin_filename != nullptr 
      && std::string(bin_filename) == std::string(data_filename_)) {
    Log::Warning("Bianry file %s already existed", bin_filename);
    return;
  }
Guolin Ke's avatar
Guolin Ke committed
165
  // if not pass a filename, just append ".bin" of original file
Guolin Ke's avatar
Guolin Ke committed
166
  std::string bin_filename_str(data_filename_);
Guolin Ke's avatar
Guolin Ke committed
167
168
169
170
  if (bin_filename == nullptr || bin_filename[0] == '\0') {
    bin_filename_str.append(".bin");
    bin_filename = bin_filename_str.c_str();
  }
Guolin Ke's avatar
Guolin Ke committed
171
172
173
174
175
176
177
178
179
180
181
182
183
  bool is_file_existed = false;
  FILE* file;
#ifdef _MSC_VER
  fopen_s(&file, bin_filename, "rb");
#else
  file = fopen(bin_filename, "rb");
#endif

  if (file != NULL) {
    is_file_existed = true;
    Log::Warning("File %s existed, cannot save binary to it", bin_filename);
    fclose(file);
  }
Guolin Ke's avatar
Guolin Ke committed
184

Guolin Ke's avatar
Guolin Ke committed
185
  if (!is_file_existed) {
Guolin Ke's avatar
Guolin Ke committed
186
#ifdef _MSC_VER
Guolin Ke's avatar
Guolin Ke committed
187
    fopen_s(&file, bin_filename, "wb");
Guolin Ke's avatar
Guolin Ke committed
188
#else
Guolin Ke's avatar
Guolin Ke committed
189
    file = fopen(bin_filename, "wb");
Guolin Ke's avatar
Guolin Ke committed
190
#endif
Guolin Ke's avatar
Guolin Ke committed
191
    if (file == NULL) {
Guolin Ke's avatar
Guolin Ke committed
192
      Log::Fatal("Cannot write binary data to %s ", bin_filename);
Guolin Ke's avatar
Guolin Ke committed
193
    }
194
    Log::Info("Saving data to binary file %s", bin_filename);
195
196
    size_t size_of_token = std::strlen(binary_file_token);
    fwrite(binary_file_token, sizeof(char), size_of_token, file);
Guolin Ke's avatar
Guolin Ke committed
197
    // get size of header
198
    size_t size_of_header = sizeof(num_data_) + sizeof(num_features_) + sizeof(num_total_features_) 
Guolin Ke's avatar
Guolin Ke committed
199
      + sizeof(size_t) + sizeof(int) * used_feature_map_.size();
200
201
202
203
    // size of feature names
    for (int i = 0; i < num_total_features_; ++i) {
      size_of_header += feature_names_[i].size() + sizeof(int);
    }
Guolin Ke's avatar
Guolin Ke committed
204
205
206
207
    fwrite(&size_of_header, sizeof(size_of_header), 1, file);
    // write header
    fwrite(&num_data_, sizeof(num_data_), 1, file);
    fwrite(&num_features_, sizeof(num_features_), 1, file);
208
    fwrite(&num_total_features_, sizeof(num_features_), 1, file);
Guolin Ke's avatar
Guolin Ke committed
209
210
211
212
    size_t num_used_feature_map = used_feature_map_.size();
    fwrite(&num_used_feature_map, sizeof(num_used_feature_map), 1, file);
    fwrite(used_feature_map_.data(), sizeof(int), num_used_feature_map, file);

213
214
215
216
217
218
219
220
    // write feature names
    for (int i = 0; i < num_total_features_; ++i) {
      int str_len = static_cast<int>(feature_names_[i].size());
      fwrite(&str_len, sizeof(int), 1, file);
      const char* c_str = feature_names_[i].c_str();
      fwrite(c_str, sizeof(char), str_len, file);
    }

Guolin Ke's avatar
Guolin Ke committed
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
    // get size of meta data
    size_t size_of_metadata = metadata_.SizesInByte();
    fwrite(&size_of_metadata, sizeof(size_of_metadata), 1, file);
    // write meta data
    metadata_.SaveBinaryToFile(file);

    // write feature data
    for (int i = 0; i < num_features_; ++i) {
      // get size of feature
      size_t size_of_feature = features_[i]->SizesInByte();
      fwrite(&size_of_feature, sizeof(size_of_feature), 1, file);
      // write feature
      features_[i]->SaveBinaryToFile(file);
    }
    fclose(file);
  }
}

}  // namespace LightGBM