common.h 15.5 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
5
6
7
8
9
10
#ifndef LIGHTGBM_UTILS_COMMON_FUN_H_
#define LIGHTGBM_UTILS_COMMON_FUN_H_

#include <LightGBM/utils/log.h>

#include <cstdio>
#include <string>
#include <vector>
#include <sstream>
#include <cstdint>
11
#include <algorithm>
12
#include <cmath>
Guolin Ke's avatar
Guolin Ke committed
13
#include <functional>
Guolin Ke's avatar
Guolin Ke committed
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39

namespace LightGBM {

namespace Common {

template<typename T>
inline static T Max(const T& a, const T& b) {
  return a > b ? a : b;
}

template<typename T>
inline static T Min(const T& a, const T& b) {
  return a < b ? a : b;
}



inline static std::string& Trim(std::string& str) {
  if (str.size() <= 0) {
    return str;
  }
  str.erase(str.find_last_not_of(" \f\n\r\t\v") + 1);
  str.erase(0, str.find_first_not_of(" \f\n\r\t\v"));
  return str;
}

40
41
42
43
44
45
46
47
inline static std::string& RemoveQuotationSymbol(std::string& str) {
  if (str.size() <= 0) {
    return str;
  }
  str.erase(str.find_last_not_of("'\"") + 1);
  str.erase(0, str.find_first_not_of("'\""));
  return str;
}
Guolin Ke's avatar
Guolin Ke committed
48
49
50
51
52
53
54
inline static bool StartsWith(const std::string& str, const std::string prefix) {
  if (str.substr(0, prefix.size()) == prefix) {
    return true;
  } else {
    return false;
  }
}
Guolin Ke's avatar
Guolin Ke committed
55
inline static std::vector<std::string> Split(const char* c_str, char delimiter) {
Guolin Ke's avatar
Guolin Ke committed
56
  std::vector<std::string> ret;
Guolin Ke's avatar
Guolin Ke committed
57
58
59
60
61
62
63
  std::string str(c_str);
  size_t i = 0;
  size_t pos = str.find(delimiter);
  while (pos != std::string::npos) {
    ret.push_back(str.substr(i, pos - i));
    i = ++pos;
    pos = str.find(delimiter, pos);
Guolin Ke's avatar
Guolin Ke committed
64
  }
Guolin Ke's avatar
Guolin Ke committed
65
  ret.push_back(str.substr(i));
Guolin Ke's avatar
Guolin Ke committed
66
67
68
  return ret;
}

Guolin Ke's avatar
Guolin Ke committed
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
inline static std::vector<std::string> Split(const char* c_str, const char* delimiters) {
  // will split when met any chars in delimiters
  std::vector<std::string> ret;
  std::string str(c_str);
  size_t i = 0;
  size_t pos = str.find_first_of(delimiters);
  while (pos != std::string::npos) {
    ret.push_back(str.substr(i, pos - i));
    i = ++pos;
    pos = str.find_first_of(delimiters, pos);
  }
  ret.push_back(str.substr(i));
  return ret;
}

84
85
86
87
88
89
90
91
92
93
template<typename T>
inline std::string Join(const std::vector<T>& data, char delimiters) {
  std::stringstream result_stream_buf;
  result_stream_buf << data[0];
  for (size_t i = 1; i < data.size(); ++i) {
    result_stream_buf << delimiters << data[i];
  }
  return result_stream_buf.str();
}

Guolin Ke's avatar
Guolin Ke committed
94
95
96
97
98
99
100
101
102
inline static const char* Atoi(const char* p, int* out) {
  int sign, value;
  while (*p == ' ') {
    ++p;
  }
  sign = 1;
  if (*p == '-') {
    sign = -1;
    ++p;
103
  } else if (*p == '+') {
Guolin Ke's avatar
Guolin Ke committed
104
105
106
107
108
109
110
111
112
113
114
115
    ++p;
  }
  for (value = 0; *p >= '0' && *p <= '9'; ++p) {
    value = value * 10 + (*p - '0');
  }
  *out = sign * value;
  while (*p == ' ') {
    ++p;
  }
  return p;
}

116
inline static const char* Atof(const char* p, double* out) {
Guolin Ke's avatar
Guolin Ke committed
117
  int frac;
118
  double sign, value, scale;
Guolin Ke's avatar
Guolin Ke committed
119
  *out = 0;
Guolin Ke's avatar
Guolin Ke committed
120
121
122
123
124
125
  // Skip leading white space, if any.
  while (*p == ' ') {
    ++p;
  }

  // Get sign, if any.
126
  sign = 1.0;
Guolin Ke's avatar
Guolin Ke committed
127
  if (*p == '-') {
128
    sign = -1.0;
Guolin Ke's avatar
Guolin Ke committed
129
    ++p;
130
  } else if (*p == '+') {
Guolin Ke's avatar
Guolin Ke committed
131
132
133
    ++p;
  }

Guolin Ke's avatar
Guolin Ke committed
134
135
136
  // is a number
  if ((*p >= '0' && *p <= '9') || *p == '.' || *p == 'e' || *p == 'E') {
    // Get digits before decimal point or exponent, if any.
137
138
    for (value = 0.0; *p >= '0' && *p <= '9'; ++p) {
      value = value * 10.0 + (*p - '0');
Guolin Ke's avatar
Guolin Ke committed
139
    }
Guolin Ke's avatar
Guolin Ke committed
140

Guolin Ke's avatar
Guolin Ke committed
141
142
    // Get digits after decimal point, if any.
    if (*p == '.') {
143
      double pow10 = 10.0;
Guolin Ke's avatar
Guolin Ke committed
144
      ++p;
Guolin Ke's avatar
Guolin Ke committed
145
146
      while (*p >= '0' && *p <= '9') {
        value += (*p - '0') / pow10;
147
        pow10 *= 10.0;
Guolin Ke's avatar
Guolin Ke committed
148
149
        ++p;
      }
Guolin Ke's avatar
Guolin Ke committed
150
151
    }

Guolin Ke's avatar
Guolin Ke committed
152
153
    // Handle exponent, if any.
    frac = 0;
154
    scale = 1.0;
Guolin Ke's avatar
Guolin Ke committed
155
156
157
    if ((*p == 'e') || (*p == 'E')) {
      unsigned int expon;
      // Get sign of exponent, if any.
Guolin Ke's avatar
Guolin Ke committed
158
      ++p;
Guolin Ke's avatar
Guolin Ke committed
159
160
161
162
163
164
165
166
167
168
      if (*p == '-') {
        frac = 1;
        ++p;
      } else if (*p == '+') {
        ++p;
      }
      // Get digits of exponent, if any.
      for (expon = 0; *p >= '0' && *p <= '9'; ++p) {
        expon = expon * 10 + (*p - '0');
      }
169
170
171
      if (expon > 308) expon = 308;
      // Calculate scaling factor.
      while (expon >= 50) { scale *= 1E50; expon -= 50; }
Guolin Ke's avatar
Guolin Ke committed
172
      while (expon >= 8) { scale *= 1E8;  expon -= 8; }
173
      while (expon > 0) { scale *= 10.0; expon -= 1; }
Guolin Ke's avatar
Guolin Ke committed
174
    }
Guolin Ke's avatar
Guolin Ke committed
175
176
177
    // Return signed and scaled floating point result.
    *out = sign * (frac ? (value / scale) : (value * scale));
  } else {
178
    size_t cnt = 0;
179
    while (*(p + cnt) != '\0' && *(p + cnt) != ' '
180
181
      && *(p + cnt) != '\t' && *(p + cnt) != ','
      && *(p + cnt) != '\n' && *(p + cnt) != '\r'
182
      && *(p + cnt) != ':') {
183
184
      ++cnt;
    }
185
    if (cnt > 0) {
Guolin Ke's avatar
Guolin Ke committed
186
187
188
      std::string tmp_str(p, cnt);
      std::transform(tmp_str.begin(), tmp_str.end(), tmp_str.begin(), ::tolower);
      if (tmp_str == std::string("na") || tmp_str == std::string("nan")) {
189
        *out = 0;
190
      } else if (tmp_str == std::string("inf") || tmp_str == std::string("infinity")) {
191
        *out = sign * 1e308;
192
      } else {
193
        Log::Fatal("Unknown token %s in data file", tmp_str.c_str());
Guolin Ke's avatar
Guolin Ke committed
194
195
      }
      p += cnt;
196
    }
Guolin Ke's avatar
Guolin Ke committed
197
  }
Guolin Ke's avatar
Guolin Ke committed
198

Guolin Ke's avatar
Guolin Ke committed
199
200
201
  while (*p == ' ') {
    ++p;
  }
Guolin Ke's avatar
Guolin Ke committed
202

Guolin Ke's avatar
Guolin Ke committed
203
204
205
  return p;
}

206
207


208
209
210
211
212
213
214
215
inline bool AtoiAndCheck(const char* p, int* out) {
  const char* after = Atoi(p, out);
  if (*after != '\0') {
    return false;
  }
  return true;
}

216
inline bool AtofAndCheck(const char* p, double* out) {
217
218
219
220
221
222
223
  const char* after = Atof(p, out);
  if (*after != '\0') {
    return false;
  }
  return true;
}

Guolin Ke's avatar
Guolin Ke committed
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
inline static const char* SkipSpaceAndTab(const char* p) {
  while (*p == ' ' || *p == '\t') {
    ++p;
  }
  return p;
}

inline static const char* SkipReturn(const char* p) {
  while (*p == '\n' || *p == '\r' || *p == ' ') {
    ++p;
  }
  return p;
}

template<typename T>
inline static std::string ArrayToString(const T* arr, int n, char delimiter) {
  if (n <= 0) {
    return std::string("");
  }
243
244
  std::stringstream str_buf;
  str_buf << arr[0];
Guolin Ke's avatar
Guolin Ke committed
245
  for (int i = 1; i < n; ++i) {
246
247
    str_buf << delimiter;
    str_buf << arr[i];
Guolin Ke's avatar
Guolin Ke committed
248
  }
249
  return str_buf.str();
Guolin Ke's avatar
Guolin Ke committed
250
251
}

252
253
254
255
template<typename T>
inline static std::string ArrayToString(std::vector<T> arr, char delimiter) {
  if (arr.size() <= 0) {
    return std::string("");
Guolin Ke's avatar
Guolin Ke committed
256
  }
257
258
259
260
261
  std::stringstream str_buf;
  str_buf << arr[0];
  for (size_t i = 1; i < arr.size(); ++i) {
    str_buf << delimiter;
    str_buf << arr[i];
Guolin Ke's avatar
Guolin Ke committed
262
  }
263
  return str_buf.str();
Guolin Ke's avatar
Guolin Ke committed
264
265
}

266
inline static void StringToIntArray(const std::string& str, char delimiter, size_t n, int* out) {
Guolin Ke's avatar
Guolin Ke committed
267
268
  std::vector<std::string> strs = Split(str.c_str(), delimiter);
  if (strs.size() != n) {
269
    Log::Fatal("StringToIntArray error, size doesn't match.");
Guolin Ke's avatar
Guolin Ke committed
270
271
272
  }
  for (size_t i = 0; i < strs.size(); ++i) {
    strs[i] = Trim(strs[i]);
273
    Atoi(strs[i].c_str(), &out[i]);
Guolin Ke's avatar
Guolin Ke committed
274
275
276
  }
}

277
278

inline static void StringToDoubleArray(const std::string& str, char delimiter, size_t n, double* out) {
Guolin Ke's avatar
Guolin Ke committed
279
280
  std::vector<std::string> strs = Split(str.c_str(), delimiter);
  if (strs.size() != n) {
281
    Log::Fatal("StringToDoubleArray error, size doesn't match.");
Guolin Ke's avatar
Guolin Ke committed
282
283
284
  }
  for (size_t i = 0; i < strs.size(); ++i) {
    strs[i] = Trim(strs[i]);
285
    Atof(strs[i].c_str(), &out[i]);
Guolin Ke's avatar
Guolin Ke committed
286
287
288
  }
}

289
inline static std::vector<double> StringToDoubleArray(const std::string& str, char delimiter) {
Guolin Ke's avatar
Guolin Ke committed
290
  std::vector<std::string> strs = Split(str.c_str(), delimiter);
291
  std::vector<double> ret;
Guolin Ke's avatar
Guolin Ke committed
292
293
  for (size_t i = 0; i < strs.size(); ++i) {
    strs[i] = Trim(strs[i]);
294
    double val = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
    Atof(strs[i].c_str(), &val);
    ret.push_back(val);
  }
  return ret;
}

inline static std::vector<int> StringToIntArray(const std::string& str, char delimiter) {
  std::vector<std::string> strs = Split(str.c_str(), delimiter);
  std::vector<int> ret;
  for (size_t i = 0; i < strs.size(); ++i) {
    strs[i] = Trim(strs[i]);
    int val = 0;
    Atoi(strs[i].c_str(), &val);
    ret.push_back(val);
  }
  return ret;
}

inline static std::string Join(const std::vector<std::string>& strs, char delimiter) {
  if (strs.size() <= 0) {
    return std::string("");
  }
  std::stringstream ss;
  ss << strs[0];
  for (size_t i = 1; i < strs.size(); ++i) {
    ss << delimiter;
    ss << strs[i];
  }
  return ss.str();
}

inline static std::string Join(const std::vector<std::string>& strs, size_t start, size_t end, char delimiter) {
  if (end - start <= 0) {
    return std::string("");
  }
  start = Min<size_t>(start, static_cast<size_t>(strs.size()) - 1);
  end = Min<size_t>(end, static_cast<size_t>(strs.size()));
  std::stringstream ss;
  ss << strs[start];
  for (size_t i = start + 1; i < end; ++i) {
    ss << delimiter;
    ss << strs[i];
  }
  return ss.str();
}

static inline int64_t Pow2RoundUp(int64_t x) {
  int64_t t = 1;
  for (int i = 0; i < 64; ++i) {
    if (t >= x) {
      return t;
    }
    t <<= 1;
  }
  return 0;
}

352
353
354
355
/*!
 * \brief Do inplace softmax transformaton on p_rec
 * \param p_rec The input/output vector of the values.
 */
356
357
358
inline void Softmax(std::vector<double>* p_rec) {
  std::vector<double> &rec = *p_rec;
  double wmax = rec[0];
359
360
361
  for (size_t i = 1; i < rec.size(); ++i) {
    wmax = std::max(rec[i], wmax);
  }
362
  double wsum = 0.0f;
363
364
365
366
367
  for (size_t i = 0; i < rec.size(); ++i) {
    rec[i] = std::exp(rec[i] - wmax);
    wsum += rec[i];
  }
  for (size_t i = 0; i < rec.size(); ++i) {
368
    rec[i] /= static_cast<double>(wsum);
369
370
371
  }
}

372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
template<typename T1, typename T2>
inline void SortForPair(std::vector<T1>& keys, std::vector<T2>& values, size_t start, bool is_reverse = false) {
  std::vector<std::pair<T1, T2>> arr;
  for (size_t i = start; i < keys.size(); ++i) {
    arr.emplace_back(keys[i], values[i]);
  }
  if (!is_reverse) {
    std::sort(arr.begin(), arr.end(), [](const std::pair<T1, T2>& a, const std::pair<T1, T2>& b) {
      return a.first < b.first;
    });
  } else {
    std::sort(arr.begin(), arr.end(), [](const std::pair<T1, T2>& a, const std::pair<T1, T2>& b) {
      return a.first > b.first;
    });
  }
  for (size_t i = start; i < arr.size(); ++i) {
    keys[i] = arr[i].first;
    values[i] = arr[i].second;
  }

}

394
inline std::function<std::vector<double>(int row_idx)>
Guolin Ke's avatar
Guolin Ke committed
395
RowFunctionFromDenseMatric(const void* data, int num_row, int num_col, int float_type, int is_row_major) {
Guolin Ke's avatar
Guolin Ke committed
396
  if (float_type == 0) {
397
    const float* dptr = reinterpret_cast<const float*>(data);
Guolin Ke's avatar
Guolin Ke committed
398
    if (is_row_major) {
399
400
      return [&dptr, &num_col, &num_row](int row_idx) {
        CHECK(row_idx < num_row);
Guolin Ke's avatar
Guolin Ke committed
401
402
403
404
405
406
407
408
        std::vector<double> ret;
        dptr += num_col * row_idx;
        for (int i = 0; i < num_col; ++i) {
          ret.push_back(static_cast<double>(*(dptr + i)));
        }
        return ret;
      };
    } else {
409
410
      return [&dptr, &num_col, &num_row](int row_idx) {
        CHECK(row_idx < num_row);
Guolin Ke's avatar
Guolin Ke committed
411
412
413
414
415
416
417
418
        std::vector<double> ret;
        for (int i = 0; i < num_col; ++i) {
          ret.push_back(static_cast<double>(*(dptr + num_row * i + row_idx)));
        }
        return ret;
      };
    }
  } else {
419
    const double* dptr = reinterpret_cast<const double*>(data);
Guolin Ke's avatar
Guolin Ke committed
420
    if (is_row_major) {
421
422
      return [&dptr, &num_col, &num_row](int row_idx) {
        CHECK(row_idx < num_row);
Guolin Ke's avatar
Guolin Ke committed
423
424
425
426
427
428
429
430
        std::vector<double> ret;
        dptr += num_col * row_idx;
        for (int i = 0; i < num_col; ++i) {
          ret.push_back(static_cast<double>(*(dptr + i)));
        }
        return ret;
      };
    } else {
431
432
      return [&dptr, &num_col, &num_row](int row_idx) {
        CHECK(row_idx < num_row);
Guolin Ke's avatar
Guolin Ke committed
433
434
435
436
437
438
439
440
441
442
        std::vector<double> ret;
        for (int i = 0; i < num_col; ++i) {
          ret.push_back(static_cast<double>(*(dptr + num_row * i + row_idx)));
        }
        return ret;
      };
    }
  }
}

443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
inline std::function<std::vector<std::pair<int, double>>(int row_idx)>
RowPairFunctionFromDenseMatric(const void* data, int num_row, int num_col, int float_type, int is_row_major) {
  if (float_type == 0) {
    const float* dptr = reinterpret_cast<const float*>(data);
    if (is_row_major) {
      return [&dptr, &num_col, &num_row](int row_idx) {
        CHECK(row_idx < num_row);
        std::vector<std::pair<int, double>> ret;
        dptr += num_col * row_idx;
        for (int i = 0; i < num_col; ++i) {
          ret.emplace_back(i, static_cast<double>(*(dptr + i)));
        }
        return ret;
      };
    } else {
      return [&dptr, &num_col, &num_row](int row_idx) {
        CHECK(row_idx < num_row);
        std::vector<std::pair<int, double>> ret;
        for (int i = 0; i < num_col; ++i) {
          ret.emplace_back(i, static_cast<double>(*(dptr + num_row * i + row_idx)));
        }
        return ret;
      };
    }
  } else {
    const double* dptr = reinterpret_cast<const double*>(data);
    if (is_row_major) {
      return [&dptr, &num_col, &num_row](int row_idx) {
        CHECK(row_idx < num_row);
        std::vector<std::pair<int, double>> ret;
        dptr += num_col * row_idx;
        for (int i = 0; i < num_col; ++i) {
          ret.emplace_back(i, static_cast<double>(*(dptr + i)));
        }
        return ret;
      };
    } else {
      return [&dptr, &num_col, &num_row](int row_idx) {
        CHECK(row_idx < num_row);
        std::vector<std::pair<int, double>> ret;
        for (int i = 0; i < num_col; ++i) {
          ret.emplace_back(i, static_cast<double>(*(dptr + num_row * i + row_idx)));
        }
        return ret;
      };
    }
  }
}
491
492

inline std::function<std::vector<std::pair<int, double>>(int idx)>
Guolin Ke's avatar
Guolin Ke committed
493
RowFunctionFromCSR(const int32_t* indptr, const int32_t* indices, const void* data, int float_type, uint64_t nindptr, uint64_t nelem) {
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
  if (float_type == 0) {
    const float* dptr = reinterpret_cast<const float*>(data);
    return [&indptr, &indices, &dptr, &nindptr, &nelem](int idx) {
      CHECK(idx + 1 < nindptr);
      std::vector<std::pair<int, double>> ret;
      int32_t start = indptr[idx];
      int32_t end = indptr[idx + 1];
      CHECK(start >= 0 && end < nelem);
      for (int32_t i = start; i < end; ++i) {
        ret.emplace_back(indices[i], dptr[i]);
      }
      return ret;
    };
  } else {
    const double* dptr = reinterpret_cast<const double*>(data);
    return [&indptr, &indices, &dptr, &nindptr, &nelem](int idx) {
      CHECK(idx + 1 < nindptr);
      std::vector<std::pair<int, double>> ret;
      int32_t start = indptr[idx];
      int32_t end = indptr[idx + 1];
      CHECK(start >= 0 && end < nelem);
      for (int32_t i = start; i < end; ++i) {
        ret.emplace_back(indices[i], dptr[i]);
      }
      return ret;
    };
  }
}

Guolin Ke's avatar
Guolin Ke committed
523
inline std::function<std::vector<std::pair<int, double>>(int idx)>
Guolin Ke's avatar
Guolin Ke committed
524
ColumnFunctionFromCSC(const int32_t* col_ptr, const int32_t* indices, const void* data, int float_type, uint64_t ncol_ptr, uint64_t nelem) {
Guolin Ke's avatar
Guolin Ke committed
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
  if (float_type == 0) {
    const float* dptr = reinterpret_cast<const float*>(data);
    return [&col_ptr, &indices, &dptr, &ncol_ptr, &nelem](int idx) {
      CHECK(idx + 1 < ncol_ptr);
      std::vector<std::pair<int, double>> ret;
      int32_t start = col_ptr[idx];
      int32_t end = col_ptr[idx + 1];
      CHECK(start >= 0 && end < nelem);
      for (int32_t i = start; i < end; ++i) {
        ret.emplace_back(indices[i], dptr[i]);
      }
      return ret;
    };
  } else {
    const double* dptr = reinterpret_cast<const double*>(data);
    return [&col_ptr, &indices, &dptr, &ncol_ptr, &nelem](int idx) {
      CHECK(idx + 1 < ncol_ptr);
      std::vector<std::pair<int, double>> ret;
      int32_t start = col_ptr[idx];
      int32_t end = col_ptr[idx + 1];
      CHECK(start >= 0 && end < nelem);
      for (int32_t i = start; i < end; ++i) {
        ret.emplace_back(indices[i], dptr[i]);
      }
      return ret;
    };
  }
}

inline std::vector<double> SampleFromOneColumn(const std::vector<std::pair<int, double>>& data, const std::vector<size_t>& indices) {
  size_t j = 0;
  std::vector<double> ret;
  for (auto row_idx : indices) {
    while (j < data.size() && data[j].first < row_idx) {
      ++j;
    }
    if (j < data.size() && data[j].first == row_idx) {
      ret.push_back(data[j].second);
    } else {
      ret.push_back(0);
    }
  }
  return ret;
}
569

Guolin Ke's avatar
Guolin Ke committed
570
571
572
573
}  // namespace Common

}  // namespace LightGBM

Guolin Ke's avatar
Guolin Ke committed
574
#endif   // LightGBM_UTILS_COMMON_FUN_H_