parser.hpp 3.13 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
#ifndef LIGHTGBM_IO_PARSER_HPP_
#define LIGHTGBM_IO_PARSER_HPP_
Guolin Ke's avatar
Guolin Ke committed
3
#include <LightGBM/dataset.h>
Guolin Ke's avatar
Guolin Ke committed
4
5
6
7
8
9
10
11
12
13
14
15

#include <LightGBM/utils/common.h>
#include <LightGBM/utils/log.h>


#include <unordered_map>
#include <vector>
#include <utility>

namespace LightGBM {

class CSVParser: public Parser {
Nikita Titov's avatar
Nikita Titov committed
16
 public:
Guolin Ke's avatar
Guolin Ke committed
17
18
  explicit CSVParser(int label_idx, int total_columns)
    :label_idx_(label_idx), total_columns_(total_columns) {
Guolin Ke's avatar
Guolin Ke committed
19
  }
Guolin Ke's avatar
Guolin Ke committed
20
  inline void ParseOneLine(const char* str,
21
    std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
Guolin Ke's avatar
Guolin Ke committed
22
    int idx = 0;
23
    double val = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
24
25
    int bias = 0;
    *out_label = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
26
27
    while (*str != '\0') {
      str = Common::Atof(str, &val);
Guolin Ke's avatar
Guolin Ke committed
28
29
30
      if (idx == label_idx_) {
        *out_label = val;
        bias = -1;
31
      } else if (std::fabs(val) > kZeroThreshold || std::isnan(val)) {
Guolin Ke's avatar
Guolin Ke committed
32
        out_features->emplace_back(idx + bias, val);
Guolin Ke's avatar
Guolin Ke committed
33
      }
Guolin Ke's avatar
Guolin Ke committed
34
35
36
37
      ++idx;
      if (*str == ',') {
        ++str;
      } else if (*str != '\0') {
38
        Log::Fatal("Input format error when parsing as CSV");
Guolin Ke's avatar
Guolin Ke committed
39
40
41
      }
    }
  }
Guolin Ke's avatar
Guolin Ke committed
42
43
44
45

  inline int TotalColumns() const override {
    return total_columns_;
  }
46

Nikita Titov's avatar
Nikita Titov committed
47
 private:
Guolin Ke's avatar
Guolin Ke committed
48
  int label_idx_ = 0;
Guolin Ke's avatar
Guolin Ke committed
49
  int total_columns_ = -1;
Guolin Ke's avatar
Guolin Ke committed
50
51
52
};

class TSVParser: public Parser {
Nikita Titov's avatar
Nikita Titov committed
53
 public:
Guolin Ke's avatar
Guolin Ke committed
54
55
  explicit TSVParser(int label_idx, int total_columns)
    :label_idx_(label_idx), total_columns_(total_columns) {
Guolin Ke's avatar
Guolin Ke committed
56
  }
57
  inline void ParseOneLine(const char* str,
58
    std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
Guolin Ke's avatar
Guolin Ke committed
59
    int idx = 0;
60
    double val = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
61
    int bias = 0;
Guolin Ke's avatar
Guolin Ke committed
62
63
    while (*str != '\0') {
      str = Common::Atof(str, &val);
Guolin Ke's avatar
Guolin Ke committed
64
65
66
      if (idx == label_idx_) {
        *out_label = val;
        bias = -1;
Guolin Ke's avatar
Guolin Ke committed
67
      } else if (std::fabs(val) > kZeroThreshold || std::isnan(val)) {
Guolin Ke's avatar
Guolin Ke committed
68
        out_features->emplace_back(idx + bias, val);
Guolin Ke's avatar
Guolin Ke committed
69
      }
Guolin Ke's avatar
Guolin Ke committed
70
71
72
73
      ++idx;
      if (*str == '\t') {
        ++str;
      } else if (*str != '\0') {
74
        Log::Fatal("Input format error when parsing as TSV");
Guolin Ke's avatar
Guolin Ke committed
75
76
77
      }
    }
  }
Guolin Ke's avatar
Guolin Ke committed
78
79
80
81

  inline int TotalColumns() const override {
    return total_columns_;
  }
82

Nikita Titov's avatar
Nikita Titov committed
83
 private:
Guolin Ke's avatar
Guolin Ke committed
84
  int label_idx_ = 0;
Guolin Ke's avatar
Guolin Ke committed
85
  int total_columns_ = -1;
Guolin Ke's avatar
Guolin Ke committed
86
87
88
};

class LibSVMParser: public Parser {
Nikita Titov's avatar
Nikita Titov committed
89
 public:
Guolin Ke's avatar
Guolin Ke committed
90
91
92
  explicit LibSVMParser(int label_idx)
    :label_idx_(label_idx) {
    if (label_idx > 0) {
93
      Log::Fatal("Label should be the first column in a LibSVM file");
Guolin Ke's avatar
Guolin Ke committed
94
95
    }
  }
96
  inline void ParseOneLine(const char* str,
97
    std::vector<std::pair<int, double>>* out_features, double* out_label) const override {
Guolin Ke's avatar
Guolin Ke committed
98
    int idx = 0;
99
    double val = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
100
101
102
103
104
    if (label_idx_ == 0) {
      str = Common::Atof(str, &val);
      *out_label = val;
      str = Common::SkipSpaceAndTab(str);
    }
Guolin Ke's avatar
Guolin Ke committed
105
106
107
108
109
110
111
112
    while (*str != '\0') {
      str = Common::Atoi(str, &idx);
      str = Common::SkipSpaceAndTab(str);
      if (*str == ':') {
        ++str;
        str = Common::Atof(str, &val);
        out_features->emplace_back(idx, val);
      } else {
113
        Log::Fatal("Input format error when parsing as LibSVM");
Guolin Ke's avatar
Guolin Ke committed
114
115
116
117
      }
      str = Common::SkipSpaceAndTab(str);
    }
  }
Guolin Ke's avatar
Guolin Ke committed
118
119
120
121

  inline int TotalColumns() const override {
    return -1;
  }
122

Nikita Titov's avatar
Nikita Titov committed
123
 private:
Guolin Ke's avatar
Guolin Ke committed
124
  int label_idx_ = 0;
Guolin Ke's avatar
Guolin Ke committed
125
};
Guolin Ke's avatar
Guolin Ke committed
126

Guolin Ke's avatar
Guolin Ke committed
127
}  // namespace LightGBM
Guolin Ke's avatar
Guolin Ke committed
128
#endif   // LIGHTGBM_IO_PARSER_HPP_