parser.hpp 3.13 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
#ifndef LIGHTGBM_IO_PARSER_HPP_
#define LIGHTGBM_IO_PARSER_HPP_
Guolin Ke's avatar
Guolin Ke committed
3
#include <LightGBM/dataset.h>
Guolin Ke's avatar
Guolin Ke committed
4
5
6
7
8
9
10
11
12
13
14
15

#include <LightGBM/utils/common.h>
#include <LightGBM/utils/log.h>


#include <unordered_map>
#include <vector>
#include <utility>

namespace LightGBM {

class CSVParser: public Parser {
Nikita Titov's avatar
Nikita Titov committed
16
 public:
Guolin Ke's avatar
Guolin Ke committed
17
18
  explicit CSVParser(int label_idx, int total_columns)
    :label_idx_(label_idx), total_columns_(total_columns) {
Guolin Ke's avatar
Guolin Ke committed
19
  }
Guolin Ke's avatar
Guolin Ke committed
20
  inline void ParseOneLine(const char* str,
21
    std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
Guolin Ke's avatar
Guolin Ke committed
22
    int idx = 0;
23
    double val = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
24
25
    int bias = 0;
    *out_label = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
26
27
    while (*str != '\0') {
      str = Common::Atof(str, &val);
Guolin Ke's avatar
Guolin Ke committed
28
29
30
31
      if (idx == label_idx_) {
        *out_label = val;
        bias = -1;
      }
Guolin Ke's avatar
Guolin Ke committed
32
      else if (std::fabs(val) > kZeroThreshold || std::isnan(val)) {
Guolin Ke's avatar
Guolin Ke committed
33
        out_features->emplace_back(idx + bias, val);
Guolin Ke's avatar
Guolin Ke committed
34
      }
Guolin Ke's avatar
Guolin Ke committed
35
36
37
38
      ++idx;
      if (*str == ',') {
        ++str;
      } else if (*str != '\0') {
39
        Log::Fatal("Input format error when parsing as CSV");
Guolin Ke's avatar
Guolin Ke committed
40
41
42
      }
    }
  }
Guolin Ke's avatar
Guolin Ke committed
43
44
45
46

  inline int TotalColumns() const override {
    return total_columns_;
  }
47

Nikita Titov's avatar
Nikita Titov committed
48
 private:
Guolin Ke's avatar
Guolin Ke committed
49
  int label_idx_ = 0;
Guolin Ke's avatar
Guolin Ke committed
50
  int total_columns_ = -1;
Guolin Ke's avatar
Guolin Ke committed
51
52
53
};

class TSVParser: public Parser {
Nikita Titov's avatar
Nikita Titov committed
54
 public:
Guolin Ke's avatar
Guolin Ke committed
55
56
  explicit TSVParser(int label_idx, int total_columns)
    :label_idx_(label_idx), total_columns_(total_columns) {
Guolin Ke's avatar
Guolin Ke committed
57
  }
58
  inline void ParseOneLine(const char* str,
59
    std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
Guolin Ke's avatar
Guolin Ke committed
60
    int idx = 0;
61
    double val = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
62
    int bias = 0;
Guolin Ke's avatar
Guolin Ke committed
63
64
    while (*str != '\0') {
      str = Common::Atof(str, &val);
Guolin Ke's avatar
Guolin Ke committed
65
66
67
      if (idx == label_idx_) {
        *out_label = val;
        bias = -1;
Guolin Ke's avatar
Guolin Ke committed
68
      } else if (std::fabs(val) > kZeroThreshold || std::isnan(val)) {
Guolin Ke's avatar
Guolin Ke committed
69
        out_features->emplace_back(idx + bias, val);
Guolin Ke's avatar
Guolin Ke committed
70
      }
Guolin Ke's avatar
Guolin Ke committed
71
72
73
74
      ++idx;
      if (*str == '\t') {
        ++str;
      } else if (*str != '\0') {
75
        Log::Fatal("Input format error when parsing as TSV");
Guolin Ke's avatar
Guolin Ke committed
76
77
78
      }
    }
  }
Guolin Ke's avatar
Guolin Ke committed
79
80
81
82

  inline int TotalColumns() const override {
    return total_columns_;
  }
83

Nikita Titov's avatar
Nikita Titov committed
84
 private:
Guolin Ke's avatar
Guolin Ke committed
85
  int label_idx_ = 0;
Guolin Ke's avatar
Guolin Ke committed
86
  int total_columns_ = -1;
Guolin Ke's avatar
Guolin Ke committed
87
88
89
};

class LibSVMParser: public Parser {
Nikita Titov's avatar
Nikita Titov committed
90
 public:
Guolin Ke's avatar
Guolin Ke committed
91
92
93
  explicit LibSVMParser(int label_idx)
    :label_idx_(label_idx) {
    if (label_idx > 0) {
94
      Log::Fatal("Label should be the first column in a LibSVM file");
Guolin Ke's avatar
Guolin Ke committed
95
96
    }
  }
97
  inline void ParseOneLine(const char* str,
98
    std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
Guolin Ke's avatar
Guolin Ke committed
99
    int idx = 0;
100
    double val = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
101
102
103
104
105
    if (label_idx_ == 0) {
      str = Common::Atof(str, &val);
      *out_label = val;
      str = Common::SkipSpaceAndTab(str);
    }
Guolin Ke's avatar
Guolin Ke committed
106
107
108
109
110
111
112
113
    while (*str != '\0') {
      str = Common::Atoi(str, &idx);
      str = Common::SkipSpaceAndTab(str);
      if (*str == ':') {
        ++str;
        str = Common::Atof(str, &val);
        out_features->emplace_back(idx, val);
      } else {
114
        Log::Fatal("Input format error when parsing as LibSVM");
Guolin Ke's avatar
Guolin Ke committed
115
116
117
118
      }
      str = Common::SkipSpaceAndTab(str);
    }
  }
Guolin Ke's avatar
Guolin Ke committed
119
120
121
122

  inline int TotalColumns() const override {
    return -1;
  }
123

Nikita Titov's avatar
Nikita Titov committed
124
 private:
Guolin Ke's avatar
Guolin Ke committed
125
  int label_idx_ = 0;
Guolin Ke's avatar
Guolin Ke committed
126
};
Guolin Ke's avatar
Guolin Ke committed
127

Guolin Ke's avatar
Guolin Ke committed
128
}  // namespace LightGBM
Guolin Ke's avatar
Guolin Ke committed
129
#endif   // LIGHTGBM_IO_PARSER_HPP_