common.h 11.4 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
5
6
7
8
9
10
#ifndef LIGHTGBM_UTILS_COMMON_FUN_H_
#define LIGHTGBM_UTILS_COMMON_FUN_H_

#include <LightGBM/utils/log.h>

#include <cstdio>
#include <string>
#include <vector>
#include <sstream>
#include <cstdint>
11
#include <algorithm>
12
#include <cmath>
Guolin Ke's avatar
Guolin Ke committed
13
#include <functional>
Guolin Ke's avatar
Guolin Ke committed
14
#include <memory>
Guolin Ke's avatar
Guolin Ke committed
15
#include <type_traits>
16
#include <iomanip>
Guolin Ke's avatar
Guolin Ke committed
17
18
19
20
21

namespace LightGBM {

namespace Common {

Guolin Ke's avatar
Guolin Ke committed
22
23
24
25
26
27
inline char tolower(char in) {
  if (in <= 'Z' && in >= 'A')
    return in - ('Z' - 'z');
  return in;
}

Guolin Ke's avatar
Guolin Ke committed
28
inline static std::string& Trim(std::string& str) {
Guolin Ke's avatar
Guolin Ke committed
29
  if (str.empty()) {
Guolin Ke's avatar
Guolin Ke committed
30
31
32
33
34
35
36
    return str;
  }
  str.erase(str.find_last_not_of(" \f\n\r\t\v") + 1);
  str.erase(0, str.find_first_not_of(" \f\n\r\t\v"));
  return str;
}

37
inline static std::string& RemoveQuotationSymbol(std::string& str) {
Guolin Ke's avatar
Guolin Ke committed
38
  if (str.empty()) {
39
40
41
42
43
44
    return str;
  }
  str.erase(str.find_last_not_of("'\"") + 1);
  str.erase(0, str.find_first_not_of("'\""));
  return str;
}
Guolin Ke's avatar
Guolin Ke committed
45
46
47
48
49
50
51
inline static bool StartsWith(const std::string& str, const std::string prefix) {
  if (str.substr(0, prefix.size()) == prefix) {
    return true;
  } else {
    return false;
  }
}
Guolin Ke's avatar
Guolin Ke committed
52
inline static std::vector<std::string> Split(const char* c_str, char delimiter) {
Guolin Ke's avatar
Guolin Ke committed
53
  std::vector<std::string> ret;
Guolin Ke's avatar
Guolin Ke committed
54
55
56
57
58
59
60
  std::string str(c_str);
  size_t i = 0;
  size_t pos = str.find(delimiter);
  while (pos != std::string::npos) {
    ret.push_back(str.substr(i, pos - i));
    i = ++pos;
    pos = str.find(delimiter, pos);
Guolin Ke's avatar
Guolin Ke committed
61
  }
Guolin Ke's avatar
Guolin Ke committed
62
  ret.push_back(str.substr(i));
Guolin Ke's avatar
Guolin Ke committed
63
64
65
  return ret;
}

Guolin Ke's avatar
Guolin Ke committed
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
inline static std::vector<std::string> Split(const char* c_str, const char* delimiters) {
  // will split when met any chars in delimiters
  std::vector<std::string> ret;
  std::string str(c_str);
  size_t i = 0;
  size_t pos = str.find_first_of(delimiters);
  while (pos != std::string::npos) {
    ret.push_back(str.substr(i, pos - i));
    i = ++pos;
    pos = str.find_first_of(delimiters, pos);
  }
  ret.push_back(str.substr(i));
  return ret;
}

81
82
83
84
85
86
inline static std::string FindFromLines(const std::vector<std::string>& lines, const char* key_word) {
  for (auto& line : lines) {
    size_t find_pos = line.find(key_word);
    if (find_pos != std::string::npos) {
      return line;
    }
87
  }
88
  return "";
89
90
}

Guolin Ke's avatar
Guolin Ke committed
91
92
93
94
95
96
97
98
99
inline static const char* Atoi(const char* p, int* out) {
  int sign, value;
  while (*p == ' ') {
    ++p;
  }
  sign = 1;
  if (*p == '-') {
    sign = -1;
    ++p;
100
  } else if (*p == '+') {
Guolin Ke's avatar
Guolin Ke committed
101
102
103
104
105
106
107
108
109
110
111
112
    ++p;
  }
  for (value = 0; *p >= '0' && *p <= '9'; ++p) {
    value = value * 10 + (*p - '0');
  }
  *out = sign * value;
  while (*p == ' ') {
    ++p;
  }
  return p;
}

113
inline static const char* Atof(const char* p, double* out) {
Guolin Ke's avatar
Guolin Ke committed
114
  int frac;
115
  double sign, value, scale;
Guolin Ke's avatar
Guolin Ke committed
116
  *out = 0;
Guolin Ke's avatar
Guolin Ke committed
117
118
119
120
121
122
  // Skip leading white space, if any.
  while (*p == ' ') {
    ++p;
  }

  // Get sign, if any.
123
  sign = 1.0;
Guolin Ke's avatar
Guolin Ke committed
124
  if (*p == '-') {
125
    sign = -1.0;
Guolin Ke's avatar
Guolin Ke committed
126
    ++p;
127
  } else if (*p == '+') {
Guolin Ke's avatar
Guolin Ke committed
128
129
130
    ++p;
  }

Guolin Ke's avatar
Guolin Ke committed
131
132
133
  // is a number
  if ((*p >= '0' && *p <= '9') || *p == '.' || *p == 'e' || *p == 'E') {
    // Get digits before decimal point or exponent, if any.
134
135
    for (value = 0.0; *p >= '0' && *p <= '9'; ++p) {
      value = value * 10.0 + (*p - '0');
Guolin Ke's avatar
Guolin Ke committed
136
    }
Guolin Ke's avatar
Guolin Ke committed
137

Guolin Ke's avatar
Guolin Ke committed
138
139
    // Get digits after decimal point, if any.
    if (*p == '.') {
140
      double pow10 = 10.0;
Guolin Ke's avatar
Guolin Ke committed
141
      ++p;
Guolin Ke's avatar
Guolin Ke committed
142
143
      while (*p >= '0' && *p <= '9') {
        value += (*p - '0') / pow10;
144
        pow10 *= 10.0;
Guolin Ke's avatar
Guolin Ke committed
145
146
        ++p;
      }
Guolin Ke's avatar
Guolin Ke committed
147
148
    }

Guolin Ke's avatar
Guolin Ke committed
149
150
    // Handle exponent, if any.
    frac = 0;
151
    scale = 1.0;
Guolin Ke's avatar
Guolin Ke committed
152
    if ((*p == 'e') || (*p == 'E')) {
Guolin Ke's avatar
Guolin Ke committed
153
      uint32_t expon;
Guolin Ke's avatar
Guolin Ke committed
154
      // Get sign of exponent, if any.
Guolin Ke's avatar
Guolin Ke committed
155
      ++p;
Guolin Ke's avatar
Guolin Ke committed
156
157
158
159
160
161
162
163
164
165
      if (*p == '-') {
        frac = 1;
        ++p;
      } else if (*p == '+') {
        ++p;
      }
      // Get digits of exponent, if any.
      for (expon = 0; *p >= '0' && *p <= '9'; ++p) {
        expon = expon * 10 + (*p - '0');
      }
166
167
168
      if (expon > 308) expon = 308;
      // Calculate scaling factor.
      while (expon >= 50) { scale *= 1E50; expon -= 50; }
Guolin Ke's avatar
Guolin Ke committed
169
      while (expon >= 8) { scale *= 1E8;  expon -= 8; }
170
      while (expon > 0) { scale *= 10.0; expon -= 1; }
Guolin Ke's avatar
Guolin Ke committed
171
    }
Guolin Ke's avatar
Guolin Ke committed
172
173
174
    // Return signed and scaled floating point result.
    *out = sign * (frac ? (value / scale) : (value * scale));
  } else {
175
    size_t cnt = 0;
176
    while (*(p + cnt) != '\0' && *(p + cnt) != ' '
177
178
      && *(p + cnt) != '\t' && *(p + cnt) != ','
      && *(p + cnt) != '\n' && *(p + cnt) != '\r'
179
      && *(p + cnt) != ':') {
180
181
      ++cnt;
    }
182
    if (cnt > 0) {
Guolin Ke's avatar
Guolin Ke committed
183
      std::string tmp_str(p, cnt);
Guolin Ke's avatar
Guolin Ke committed
184
      std::transform(tmp_str.begin(), tmp_str.end(), tmp_str.begin(), Common::tolower);
Guolin Ke's avatar
Guolin Ke committed
185
      if (tmp_str == std::string("na") || tmp_str == std::string("nan")) {
186
        *out = 0;
187
      } else if (tmp_str == std::string("inf") || tmp_str == std::string("infinity")) {
188
        *out = sign * 1e308;
189
      } else {
190
        Log::Fatal("Unknown token %s in data file", tmp_str.c_str());
Guolin Ke's avatar
Guolin Ke committed
191
192
      }
      p += cnt;
193
    }
Guolin Ke's avatar
Guolin Ke committed
194
  }
Guolin Ke's avatar
Guolin Ke committed
195

Guolin Ke's avatar
Guolin Ke committed
196
197
198
  while (*p == ' ') {
    ++p;
  }
Guolin Ke's avatar
Guolin Ke committed
199

Guolin Ke's avatar
Guolin Ke committed
200
201
202
  return p;
}

203
204


205
206
207
208
209
210
211
212
inline bool AtoiAndCheck(const char* p, int* out) {
  const char* after = Atoi(p, out);
  if (*after != '\0') {
    return false;
  }
  return true;
}

213
inline bool AtofAndCheck(const char* p, double* out) {
214
215
216
217
218
219
220
  const char* after = Atof(p, out);
  if (*after != '\0') {
    return false;
  }
  return true;
}

Guolin Ke's avatar
Guolin Ke committed
221
222
223
224
225
226
227
228
229
230
231
232
233
234
inline static const char* SkipSpaceAndTab(const char* p) {
  while (*p == ' ' || *p == '\t') {
    ++p;
  }
  return p;
}

inline static const char* SkipReturn(const char* p) {
  while (*p == '\n' || *p == '\r' || *p == ' ') {
    ++p;
  }
  return p;
}

Guolin Ke's avatar
Guolin Ke committed
235
236
237
238
239
template<typename T, typename T2>
inline static std::vector<T2> ArrayCast(const std::vector<T>& arr) {
  std::vector<T2> ret;
  for (size_t i = 0; i < arr.size(); ++i) {
    ret.push_back(static_cast<T2>(arr[i]));
Guolin Ke's avatar
Guolin Ke committed
240
  }
Guolin Ke's avatar
Guolin Ke committed
241
  return ret;
Guolin Ke's avatar
Guolin Ke committed
242
243
}

244
template<typename T>
Guolin Ke's avatar
Guolin Ke committed
245
inline static std::string ArrayToString(const std::vector<T>& arr, char delimiter) {
Guolin Ke's avatar
Guolin Ke committed
246
  if (arr.empty()) {
247
    return std::string("");
Guolin Ke's avatar
Guolin Ke committed
248
  }
249
  std::stringstream str_buf;
250
  str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
251
252
253
254
  str_buf << arr[0];
  for (size_t i = 1; i < arr.size(); ++i) {
    str_buf << delimiter;
    str_buf << arr[i];
Guolin Ke's avatar
Guolin Ke committed
255
  }
256
  return str_buf.str();
Guolin Ke's avatar
Guolin Ke committed
257
258
}

Guolin Ke's avatar
Guolin Ke committed
259
260
261
262
263
264
template<typename T>
inline static std::string ArrayToString(const std::vector<T>& arr, size_t n, char delimiter) {
  if (arr.empty() || n == 0) {
    return std::string("");
  }
  std::stringstream str_buf;
265
  str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
Guolin Ke's avatar
Guolin Ke committed
266
267
268
269
270
271
272
273
  str_buf << arr[0];
  for (size_t i = 1; i < std::min(n, arr.size()); ++i) {
    str_buf << delimiter;
    str_buf << arr[i];
  }
  return str_buf.str();
}

Guolin Ke's avatar
Guolin Ke committed
274
275
template<typename T>
inline static std::vector<T> StringToArray(const std::string& str, char delimiter, size_t n) {
Guolin Ke's avatar
Guolin Ke committed
276
277
278
  if (n == 0) {
    return std::vector<T>();
  }
Guolin Ke's avatar
Guolin Ke committed
279
280
  std::vector<std::string> strs = Split(str.c_str(), delimiter);
  if (strs.size() != n) {
Guolin Ke's avatar
Guolin Ke committed
281
    Log::Fatal("StringToArray error, size doesn't match.");
Guolin Ke's avatar
Guolin Ke committed
282
  }
Guolin Ke's avatar
Guolin Ke committed
283
284
285
286
287
288
289
290
291
  std::vector<T> ret(n);
  if (std::is_same<T, float>::value || std::is_same<T, double>::value) {
    for (size_t i = 0; i < n; ++i) {
      ret[i] = static_cast<T>(std::stod(strs[i]));
    }
  } else {
    for (size_t i = 0; i < n; ++i) {
      ret[i] = static_cast<T>(std::stol(strs[i]));
    }
Guolin Ke's avatar
Guolin Ke committed
292
293
294
295
  }
  return ret;
}

Guolin Ke's avatar
Guolin Ke committed
296
297
template<typename T>
inline static std::vector<T> StringToArray(const std::string& str, char delimiter) {
Guolin Ke's avatar
Guolin Ke committed
298
  std::vector<std::string> strs = Split(str.c_str(), delimiter);
Guolin Ke's avatar
Guolin Ke committed
299
300
301
302
303
304
305
306
307
  std::vector<T> ret;
  if (std::is_same<T, float>::value || std::is_same<T, double>::value) {
    for (const auto& s : strs) {
      ret.push_back(static_cast<T>(std::stod(s)));
    }
  } else {
    for (const auto& s : strs) {
      ret.push_back(static_cast<T>(std::stol(s)));
    }
Guolin Ke's avatar
Guolin Ke committed
308
309
310
311
  }
  return ret;
}

312
template<typename T>
Guolin Ke's avatar
Guolin Ke committed
313
inline static std::string Join(const std::vector<T>& strs, const char* delimiter) {
Guolin Ke's avatar
Guolin Ke committed
314
  if (strs.empty()) {
Guolin Ke's avatar
Guolin Ke committed
315
316
    return std::string("");
  }
317
  std::stringstream str_buf;
318
  str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
319
  str_buf << strs[0];
Guolin Ke's avatar
Guolin Ke committed
320
  for (size_t i = 1; i < strs.size(); ++i) {
321
322
    str_buf << delimiter;
    str_buf << strs[i];
Guolin Ke's avatar
Guolin Ke committed
323
  }
324
  return str_buf.str();
Guolin Ke's avatar
Guolin Ke committed
325
326
}

327
template<typename T>
Guolin Ke's avatar
Guolin Ke committed
328
inline static std::string Join(const std::vector<T>& strs, size_t start, size_t end, const char* delimiter) {
Guolin Ke's avatar
Guolin Ke committed
329
330
331
  if (end - start <= 0) {
    return std::string("");
  }
Guolin Ke's avatar
Guolin Ke committed
332
333
  start = std::min(start, static_cast<size_t>(strs.size()) - 1);
  end = std::min(end, static_cast<size_t>(strs.size()));
334
  std::stringstream str_buf;
335
  str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
336
  str_buf << strs[start];
Guolin Ke's avatar
Guolin Ke committed
337
  for (size_t i = start + 1; i < end; ++i) {
338
339
    str_buf << delimiter;
    str_buf << strs[i];
Guolin Ke's avatar
Guolin Ke committed
340
  }
341
  return str_buf.str();
Guolin Ke's avatar
Guolin Ke committed
342
343
344
345
346
347
348
349
350
351
352
353
354
}

static inline int64_t Pow2RoundUp(int64_t x) {
  int64_t t = 1;
  for (int i = 0; i < 64; ++i) {
    if (t >= x) {
      return t;
    }
    t <<= 1;
  }
  return 0;
}

355
356
357
358
/*!
 * \brief Do inplace softmax transformaton on p_rec
 * \param p_rec The input/output vector of the values.
 */
359
360
361
inline void Softmax(std::vector<double>* p_rec) {
  std::vector<double> &rec = *p_rec;
  double wmax = rec[0];
362
363
364
  for (size_t i = 1; i < rec.size(); ++i) {
    wmax = std::max(rec[i], wmax);
  }
365
  double wsum = 0.0f;
366
367
368
369
370
  for (size_t i = 0; i < rec.size(); ++i) {
    rec[i] = std::exp(rec[i] - wmax);
    wsum += rec[i];
  }
  for (size_t i = 0; i < rec.size(); ++i) {
371
    rec[i] /= static_cast<double>(wsum);
372
373
374
  }
}

Guolin Ke's avatar
Guolin Ke committed
375
376
377
378
379
template<typename T>
std::vector<const T*> ConstPtrInVectorWrapper(const std::vector<std::unique_ptr<T>>& input) {
  std::vector<const T*> ret;
  for (size_t i = 0; i < input.size(); ++i) {
    ret.push_back(input.at(i).get());
380
  }
Guolin Ke's avatar
Guolin Ke committed
381
  return ret;
382
383
}

Guolin Ke's avatar
Guolin Ke committed
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
template<typename T1, typename T2>
inline void SortForPair(std::vector<T1>& keys, std::vector<T2>& values, size_t start, bool is_reverse = false) {
  std::vector<std::pair<T1, T2>> arr;
  for (size_t i = start; i < keys.size(); ++i) {
    arr.emplace_back(keys[i], values[i]);
  }
  if (!is_reverse) {
    std::sort(arr.begin(), arr.end(), [](const std::pair<T1, T2>& a, const std::pair<T1, T2>& b) {
      return a.first < b.first;
    });
  } else {
    std::sort(arr.begin(), arr.end(), [](const std::pair<T1, T2>& a, const std::pair<T1, T2>& b) {
      return a.first > b.first;
    });
  }
  for (size_t i = start; i < arr.size(); ++i) {
    keys[i] = arr[i].first;
    values[i] = arr[i].second;
  }

}

406
407
408
409
410
/*
* approximate hessians of absolute loss with Gaussian function
* cf. https://en.wikipedia.org/wiki/Gaussian_function
*
* y is a prediction.
Tsukasa OMOTO's avatar
Tsukasa OMOTO committed
411
412
* t means true target.
* g means gradient.
413
* eta is a parameter to control the width of Gaussian function.
414
415
* w means weights.
*/
416
417
inline static double ApproximateHessianWithGaussian(const double y, const double t, const double g,
                                                    const double eta, const double w=1.0f) {
418
  const double diff = y - t;
Tsukasa OMOTO's avatar
Tsukasa OMOTO committed
419
420
421
  const double pi = 4.0 * std::atan(1.0);
  const double x = std::fabs(diff);
  const double a = 2.0 * std::fabs(g) * w;  // difference of two first derivatives, (zero to inf) and (zero to -inf).
422
  const double b = 0.0;
423
  const double c = std::max((std::fabs(y) + std::fabs(t)) * eta, 1.0e-10);
Tsukasa OMOTO's avatar
Tsukasa OMOTO committed
424
  return w * std::exp(-(x - b) * (x - b) / (2.0 * c * c)) * a / (c * std::sqrt(2 * pi));
425
426
}

427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
template <typename T>
inline static T** Vector2Ptr(std::vector<std::vector<T>>& data) {
  T** ptr = new T*[data.size()];
  for (size_t i = 0; i < data.size(); ++i) {
    ptr[i] = data[i].data();
  }
  return ptr;
}

template <typename T>
inline static std::vector<int> VectorSize(const std::vector<std::vector<T>>& data) {
  std::vector<int> ret(data.size());
  for (size_t i = 0; i < data.size(); ++i) {
    ret[i] = static_cast<int>(data[i].size());
  }
  return ret;
}

Guolin Ke's avatar
Guolin Ke committed
445
446
447
448
}  // namespace Common

}  // namespace LightGBM

Guolin Ke's avatar
Guolin Ke committed
449
#endif   // LightGBM_UTILS_COMMON_FUN_H_