parser.hpp 2.76 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#ifndef LIGHTGBM_IO_PARSER_HPP_
#define LIGHTGBM_IO_PARSER_HPP_

#include <LightGBM/utils/common.h>
#include <LightGBM/utils/log.h>

#include <LightGBM/dataset.h>

#include <unordered_map>
#include <vector>
#include <utility>

namespace LightGBM {

class CSVParser: public Parser {
public:
Guolin Ke's avatar
Guolin Ke committed
17
18
19
  explicit CSVParser(int label_idx)
    :label_idx_(label_idx) {
  }
Guolin Ke's avatar
Guolin Ke committed
20
  inline void ParseOneLine(const char* str,
21
    std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
Guolin Ke's avatar
Guolin Ke committed
22
    int idx = 0;
23
    double val = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
24
25
    int bias = 0;
    *out_label = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
26
27
    while (*str != '\0') {
      str = Common::Atof(str, &val);
Guolin Ke's avatar
Guolin Ke committed
28
29
30
31
      if (idx == label_idx_) {
        *out_label = val;
        bias = -1;
      }
Guolin Ke's avatar
Guolin Ke committed
32
      else if (std::fabs(val) > kZeroThreshold || std::isnan(val)) {
Guolin Ke's avatar
Guolin Ke committed
33
        out_features->emplace_back(idx + bias, val);
Guolin Ke's avatar
Guolin Ke committed
34
      }
Guolin Ke's avatar
Guolin Ke committed
35
36
37
38
      ++idx;
      if (*str == ',') {
        ++str;
      } else if (*str != '\0') {
39
        Log::Fatal("Input format error when parsing as CSV");
Guolin Ke's avatar
Guolin Ke committed
40
41
42
      }
    }
  }
Guolin Ke's avatar
Guolin Ke committed
43
44
private:
  int label_idx_ = 0;
Guolin Ke's avatar
Guolin Ke committed
45
46
47
48
};

class TSVParser: public Parser {
public:
Guolin Ke's avatar
Guolin Ke committed
49
50
51
  explicit TSVParser(int label_idx)
    :label_idx_(label_idx) {
  }
52
  inline void ParseOneLine(const char* str,
53
    std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
Guolin Ke's avatar
Guolin Ke committed
54
    int idx = 0;
55
    double val = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
56
    int bias = 0;
Guolin Ke's avatar
Guolin Ke committed
57
58
    while (*str != '\0') {
      str = Common::Atof(str, &val);
Guolin Ke's avatar
Guolin Ke committed
59
60
61
      if (idx == label_idx_) {
        *out_label = val;
        bias = -1;
Guolin Ke's avatar
Guolin Ke committed
62
      } else if (std::fabs(val) > kZeroThreshold || std::isnan(val)) {
Guolin Ke's avatar
Guolin Ke committed
63
        out_features->emplace_back(idx + bias, val);
Guolin Ke's avatar
Guolin Ke committed
64
      }
Guolin Ke's avatar
Guolin Ke committed
65
66
67
68
      ++idx;
      if (*str == '\t') {
        ++str;
      } else if (*str != '\0') {
69
        Log::Fatal("Input format error when parsing as TSV");
Guolin Ke's avatar
Guolin Ke committed
70
71
72
      }
    }
  }
Guolin Ke's avatar
Guolin Ke committed
73
74
private:
  int label_idx_ = 0;
Guolin Ke's avatar
Guolin Ke committed
75
76
77
78
};

class LibSVMParser: public Parser {
public:
Guolin Ke's avatar
Guolin Ke committed
79
80
81
  explicit LibSVMParser(int label_idx)
    :label_idx_(label_idx) {
    if (label_idx > 0) {
82
      Log::Fatal("Label should be the first column in a LibSVM file");
Guolin Ke's avatar
Guolin Ke committed
83
84
    }
  }
85
  inline void ParseOneLine(const char* str,
86
    std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
Guolin Ke's avatar
Guolin Ke committed
87
    int idx = 0;
88
    double val = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
89
90
91
92
93
    if (label_idx_ == 0) {
      str = Common::Atof(str, &val);
      *out_label = val;
      str = Common::SkipSpaceAndTab(str);
    }
Guolin Ke's avatar
Guolin Ke committed
94
95
96
97
98
99
100
101
    while (*str != '\0') {
      str = Common::Atoi(str, &idx);
      str = Common::SkipSpaceAndTab(str);
      if (*str == ':') {
        ++str;
        str = Common::Atof(str, &val);
        out_features->emplace_back(idx, val);
      } else {
102
        Log::Fatal("Input format error when parsing as LibSVM");
Guolin Ke's avatar
Guolin Ke committed
103
104
105
106
      }
      str = Common::SkipSpaceAndTab(str);
    }
  }
Guolin Ke's avatar
Guolin Ke committed
107
108
private:
  int label_idx_ = 0;
Guolin Ke's avatar
Guolin Ke committed
109
};
Guolin Ke's avatar
Guolin Ke committed
110

Guolin Ke's avatar
Guolin Ke committed
111
}  // namespace LightGBM
Guolin Ke's avatar
Guolin Ke committed
112
#endif   // LIGHTGBM_IO_PARSER_HPP_