dataset.cpp 5.95 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
5
6
7
8
9
10
11
12
#include <LightGBM/dataset.h>

#include <LightGBM/feature.h>

#include <omp.h>

#include <cstdio>
#include <unordered_map>
#include <limits>
#include <vector>
#include <utility>
#include <string>
Guolin Ke's avatar
Guolin Ke committed
13
#include <sstream>
Guolin Ke's avatar
Guolin Ke committed
14
15
16
17

namespace LightGBM {


Guolin Ke's avatar
Guolin Ke committed
18
19
20
21
// Default constructor: an empty dataset with zero rows and a single class,
// not marked as having been loaded from a binary file.
Dataset::Dataset() {
  is_loading_from_binfile_ = false;
  num_data_ = 0;
  num_class_ = 1;
}

24
25
// Constructs a dataset for `num_data` rows and `num_class` classes, then
// initializes the metadata with those two sizes.
Dataset::Dataset(data_size_t num_data, int num_class) {
  is_loading_from_binfile_ = false;
  num_data_ = num_data;
  num_class_ = num_class;
  // The meaning of the two trailing -1 arguments is defined by metadata_.Init,
  // which is not visible in this file.
  metadata_.Init(num_data_, num_class_, -1, -1);
}

Guolin Ke's avatar
Guolin Ke committed
31
32
33
34
35
36
37
// Releases every Feature object held in features_ and then empties the list.
Dataset::~Dataset() {
  for (size_t i = 0; i < features_.size(); ++i) {
    delete features_[i];
  }
  features_.clear();
}

Guolin Ke's avatar
Guolin Ke committed
38
39
40
41
42
43
void Dataset::FinishLoad() {
#pragma omp parallel for schedule(guided)
  for (int i = 0; i < num_features_; ++i) {
    features_[i]->FinishLoad();
  }
}
Guolin Ke's avatar
Guolin Ke committed
44

Guolin Ke's avatar
Guolin Ke committed
45
// Replaces the feature list of `dataset` with fresh Feature objects that use
// copies of this dataset's bin mappers, and copies the feature bookkeeping
// (class count, used-feature map, feature counts and feature names).
//
// dataset          - target to fill; its own num_data_ is passed to each new Feature.
// is_enable_sparse - forwarded unchanged to the Feature constructor.
void Dataset::CopyFeatureBinMapperTo(Dataset* dataset, bool is_enable_sparse) const {
  // features_ stores owning raw pointers (the destructor deletes them), so any
  // features the target already holds must be freed before they are dropped;
  // a bare clear() would leak them.
  for (Feature* stale_feature : dataset->features_) {
    delete stale_feature;
  }
  dataset->features_.clear();
  dataset->features_.reserve(features_.size());
  // copy feature bin mapper data
  for (Feature* feature : features_) {
    dataset->features_.push_back(new Feature(feature->feature_index(),
      new BinMapper(*feature->bin_mapper()), dataset->num_data_, is_enable_sparse));
  }
  dataset->num_class_ = num_class_;
  dataset->used_feature_map_ = used_feature_map_;
  dataset->num_features_ = static_cast<int>(dataset->features_.size());
  dataset->num_total_features_ = num_total_features_;
  dataset->feature_names_ = feature_names_;
}

std::vector<const BinMapper*> Dataset::GetBinMappers() const {
  std::vector<const BinMapper*> ret(num_total_features_, nullptr);
  for (const auto feature : features_) {
    ret[feature->feature_index()] = feature->bin_mapper();
  }
  return ret;
}

void Dataset::SetField(const char* field_name, const void* field_data, data_size_t num_element, int type) {
  std::string name(field_name);
  name = Common::Trim(name);
  if (name == std::string("label") || name == std::string("target")) {
    if (type != 0) {
      Log::Fatal("type of label should be float");
    }
74
    metadata_.SetLabel(reinterpret_cast<const float*>(field_data), num_element);
Guolin Ke's avatar
Guolin Ke committed
75
  } else if (name == std::string("weight") || name == std::string("weights")) {
Guolin Ke's avatar
Guolin Ke committed
76
77
78
    if (type != 0) {
      Log::Fatal("type of weights should be float");
    }
79
    metadata_.SetWeights(reinterpret_cast<const float*>(field_data), num_element);
Guolin Ke's avatar
Guolin Ke committed
80
81
82
83
  } else if (name == std::string("init_score")) {
    if (type != 0) {
      Log::Fatal("type of init_score should be float");
    }
84
    metadata_.SetInitScore(reinterpret_cast<const float*>(field_data), num_element);
Guolin Ke's avatar
Guolin Ke committed
85
86
87
88
  } else if (name == std::string("query") || name == std::string("group")) {
    if (type != 1) {
      Log::Fatal("type of init_score should be int");
    }
89
    metadata_.SetQueryBoundaries(reinterpret_cast<const data_size_t*>(field_data), num_element);
Guolin Ke's avatar
Guolin Ke committed
90
91
92
93
94
  } else {
    Log::Fatal("unknow field name: %s", field_name);
  }
}

95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
// Looks up a metadata field by name and reports it through the out-parameters.
//
// field_name - field to fetch, trimmed before comparison. Accepted names:
//              "label"/"target", "weight"/"weights", "init_score",
//              "query"/"group".
// out_len    - receives num_data_ for every recognized field.
// out_ptr    - receives the pointer returned by the matching metadata accessor.
// out_type   - receives 0 for label, weights and init_score, 1 for query/group.
//
// An unrecognized name is reported via Log::Fatal and leaves the outputs untouched.
// NOTE(review): every field, including query boundaries, reports num_data_ as
// its length - confirm this matches the real array sizes held by metadata_.
void Dataset::GetField(const char* field_name, uint64_t* out_len, const void** out_ptr, int* out_type) {
  std::string name(field_name);
  name = Common::Trim(name);

  // Writes the three outputs in one place; only the pointer and type code vary.
  const auto publish = [&](const void* data, int type_code) {
    *out_ptr = data;
    *out_len = num_data_;
    *out_type = type_code;
  };

  if (name == "label" || name == "target") {
    publish(metadata_.label(), 0);
  } else if (name == "weight" || name == "weights") {
    publish(metadata_.weights(), 0);
  } else if (name == "init_score") {
    publish(metadata_.init_score(), 0);
  } else if (name == "query" || name == "group") {
    publish(metadata_.query_boundaries(), 1);
  } else {
    Log::Fatal("unknow field name: %s", field_name);
  }
}

Guolin Ke's avatar
Guolin Ke committed
120
void Dataset::SaveBinaryFile(const char* bin_filename) {
Guolin Ke's avatar
Guolin Ke committed
121

Guolin Ke's avatar
Guolin Ke committed
122
  if (!is_loading_from_binfile_) {
Guolin Ke's avatar
Guolin Ke committed
123
124
125
126
127
128
    // if not pass a filename, just append ".bin" of original file
    if (bin_filename == nullptr || bin_filename[0] == '\0') {
      std::string bin_filename_str(data_filename_);
      bin_filename_str.append(".bin");
      bin_filename = bin_filename_str.c_str();
    }
Guolin Ke's avatar
Guolin Ke committed
129
    FILE* file;
Guolin Ke's avatar
Guolin Ke committed
130
#ifdef _MSC_VER
Guolin Ke's avatar
Guolin Ke committed
131
    fopen_s(&file, bin_filename, "wb");
Guolin Ke's avatar
Guolin Ke committed
132
#else
Guolin Ke's avatar
Guolin Ke committed
133
    file = fopen(bin_filename, "wb");
Guolin Ke's avatar
Guolin Ke committed
134
#endif
Guolin Ke's avatar
Guolin Ke committed
135
    if (file == NULL) {
Guolin Ke's avatar
Guolin Ke committed
136
      Log::Fatal("Cannot write binary data to %s ", bin_filename);
Guolin Ke's avatar
Guolin Ke committed
137
138
    }

139
    Log::Info("Saving data to binary file %s", data_filename_);
Guolin Ke's avatar
Guolin Ke committed
140
141

    // get size of header
Guolin Ke's avatar
Guolin Ke committed
142
    size_t size_of_header = sizeof(num_data_) + sizeof(num_class_) + sizeof(num_features_) + sizeof(num_total_features_) 
Guolin Ke's avatar
Guolin Ke committed
143
      + sizeof(size_t) + sizeof(int) * used_feature_map_.size();
144
145
146
147
    // size of feature names
    for (int i = 0; i < num_total_features_; ++i) {
      size_of_header += feature_names_[i].size() + sizeof(int);
    }
Guolin Ke's avatar
Guolin Ke committed
148
149
150
    fwrite(&size_of_header, sizeof(size_of_header), 1, file);
    // write header
    fwrite(&num_data_, sizeof(num_data_), 1, file);
Guolin Ke's avatar
Guolin Ke committed
151
    fwrite(&num_class_, sizeof(num_class_), 1, file);
Guolin Ke's avatar
Guolin Ke committed
152
    fwrite(&num_features_, sizeof(num_features_), 1, file);
153
    fwrite(&num_total_features_, sizeof(num_features_), 1, file);
Guolin Ke's avatar
Guolin Ke committed
154
155
156
157
    size_t num_used_feature_map = used_feature_map_.size();
    fwrite(&num_used_feature_map, sizeof(num_used_feature_map), 1, file);
    fwrite(used_feature_map_.data(), sizeof(int), num_used_feature_map, file);

158
159
160
161
162
163
164
165
    // write feature names
    for (int i = 0; i < num_total_features_; ++i) {
      int str_len = static_cast<int>(feature_names_[i].size());
      fwrite(&str_len, sizeof(int), 1, file);
      const char* c_str = feature_names_[i].c_str();
      fwrite(c_str, sizeof(char), str_len, file);
    }

Guolin Ke's avatar
Guolin Ke committed
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
    // get size of meta data
    size_t size_of_metadata = metadata_.SizesInByte();
    fwrite(&size_of_metadata, sizeof(size_of_metadata), 1, file);
    // write meta data
    metadata_.SaveBinaryToFile(file);

    // write feature data
    for (int i = 0; i < num_features_; ++i) {
      // get size of feature
      size_t size_of_feature = features_[i]->SizesInByte();
      fwrite(&size_of_feature, sizeof(size_of_feature), 1, file);
      // write feature
      features_[i]->SaveBinaryToFile(file);
    }
    fclose(file);
  }
}

}  // namespace LightGBM